Example #1
 def __call__(self, node_dict, node_tree):
     """Detects parts of noun-phrases via CPL"""
     noun_parts = {}  # node => list of parts
     have_parent = set()
     for part in [n for n in node_dict.values() if n.tag in PROP_TAGS]:
         dep_parent = node_tree.parent(part)
         if dep_parent is not None:
             dep, parent = dep_parent
             if all([
                     dep == Dep.CPL,
                     not node_tree.has_child_via_set(part, MARKER_DEPS),
                     part.idx < parent.idx, parent.tag in NOUN_TAGS
             ]):
                 append_to_dict_list(noun_parts, parent, part)
                 have_parent.add(part)
     # Register chunks
     flattened = set()
     for root in [
             n for n in node_dict.values()
             if n in noun_parts and n not in have_parent
     ]:
         flattened |= _chunk(root, noun_parts, self.__form_eids)
     # Clean nodes
     return {
         idx: node
         for idx, node in node_dict.items() if node not in flattened
     }
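Every example on this page calls the same helper, append_to_dict_list(some_dict, key, value), whose definition is not shown here. A minimal sketch of such a helper, assuming it only needs to append the value to the list stored under the key and create that list on first use:

def append_to_dict_list(dict_of_lists, key, value):
    """Append value to the list stored under key, creating the list if missing."""
    dict_of_lists.setdefault(key, []).append(value)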
Example #2
 def __init__(self, ngrams, last_order, nb_processes):
     ngrams_dict = {}  # order => list of ngrams
     self.__total_counts = {order: 0 for order in range(1, last_order + 1)}  # order => total_count
     for ngram in ngrams:
         self.__total_counts[ngram.order] += ngram.count
         append_to_dict_list(ngrams_dict, ngram.order, ngram)
     super().__init__(ngrams_dict, last_order, nb_processes)
Example #3
 def __init__(self, *min_counts):
     """Pass the min count for each order needed.
     Examples:
       * If only unigrams are needed with min count 1, instantiate LanguageModel(1)
       * To get unigrams with mincount 10 and bigrams with mincount 5, instantiate LanguageModel(10, 5)
     """
     self.__min_counts = min_counts  # One minimum count per order (index 0 = unigrams)
     self.__max_order = len(self.__min_counts)
     self.__ngrams_details = {}  # ngram => tuple (count, proba, logp)
     self.__unk = (0, 0.0, float('-inf'))  # Unknown ngram (count=0, proba=0.0, logp=-inf)
     self.__tokenize = Tokenizer()
     # Load ngrams
     for idx, min_count in enumerate(self.__min_counts):
         order = idx + 1
         for text, count, proba in load_pkl_file(
                 cfg.DATA_DIR / 'langmodel' / ('%d_grams.pkl' % order)):
             if count >= min_count:
                 self.__ngrams_details[text] = (count, proba,
                                                math.log(proba))
     # Build next_token dict
     # ngram => list of tuples (token, proba) ordered by descending proba
     self.__next_tokens = {}
     if self.__max_order > 1:
         for ngram, details in self.__ngrams_details.items():
             tokens = ngram.split()
             if len(tokens) > 1:
                 append_to_dict_list(self.__next_tokens,
                                     ' '.join(tokens[:-1]),
                                     (tokens[-1], details[1]))
     for ngram, next_tokens in self.__next_tokens.items():
         next_tokens.sort(key=lambda x: x[1], reverse=True)
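As the docstring above describes, the constructor expects one minimum count per n-gram order. A hypothetical instantiation loading unigrams with minimum count 10 and bigrams with minimum count 5 would therefore look like:

lm = LanguageModel(10, 5)  # hypothetical usage; the class name is taken from the docstring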
Example #4
def _prepare_mkn(ngrams, last_order):
    """Calculate all counts necessary prior to smoothing"""
    lm_ngrams = {}  # order => list of ngrams
    wcs = collections.Counter()  # Continuation counts for '• ngram' and '• ngram •' patterns
    wc1 = collections.Counter()  # Per 'context •': distinct words w with count(context w) == 1
    wc2 = collections.Counter()  # Per 'context •': distinct words w with count(context w) == 2
    wc3 = collections.Counter()  # Per 'context •': distinct words w with count(context w) >= 3
    n1 = collections.Counter()  # Number of ngrams with count 1
    n2 = collections.Counter()  # Number of ngrams with count 2
    n3 = collections.Counter()  # Number of ngrams with count 3
    n4 = collections.Counter()  # Number of ngrams with count 4
    for ngram in ngrams:
        # n counts
        if ngram.count == 1:
            n1[ngram.order] += 1
        elif ngram.count == 2:
            n2[ngram.order] += 1
        elif ngram.count == 3:
            n3[ngram.order] += 1
        elif ngram.count == 4:
            n4[ngram.order] += 1
        # Wildcards
        if ngram.order > 1:
            wcs['%s %s' % ('•', ' '.join(ngram.tokens[1:]))] += 1  # • ngram
            wc = '%s %s' % (' '.join(ngram.tokens[:-1]), '•')  # ngram •
            if ngram.count == 1:
                wc1[wc] += 1
            elif ngram.count == 2:
                wc2[wc] += 1
            else:
                wc3[wc] += 1
        if ngram.order > 2:
            wcs['%s %s %s' %
                ('•', ' '.join(ngram.tokens[1:-1]), '•')] += 1  # • ngram •
        append_to_dict_list(lm_ngrams, ngram.order, ngram)
    # Calculate discounting values
    d1 = {}
    d2 = {}
    d3 = {}
    for order in range(2, last_order + 1):
        if any(
            [n1[order] == 0, n2[order] == 0, n3[order] == 0, n4[order] == 0]):
            raise ModifiedKneserNeyNotEnoughDataError()
        else:
            y = n1[order] / (n1[order] + 2. * n2[order])
            d1[order] = 1. - (2. * y * n2[order] / n1[order])
            d2[order] = 2. - (3. * y * n3[order] / n2[order])
            d3[order] = 3. - (4. * y * n4[order] / n3[order])
    return lm_ngrams, wcs, wc1, wc2, wc3, d1, d2, d3
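For reference, the discounts computed in the loop above are the standard modified Kneser-Ney estimates (Chen & Goodman), where n1..n4 denote the numbers of n-grams of a given order observed exactly 1, 2, 3 and 4 times:

    Y  = n1 / (n1 + 2*n2)
    D1 = 1 - 2*Y*(n2/n1)
    D2 = 2 - 3*Y*(n3/n2)
    D3 = 3 - 4*Y*(n4/n3)

These estimates are undefined when any of the four counts is zero for an order, which is why the function raises ModifiedKneserNeyNotEnoughDataError in that case.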
Example #5
def extract_classes(resources_dir):
    """Extracts entity's classes from knowledge base"""
    reports_dir = cfg.REPORTS_DIR / 'extract_classes'
    timer = Timer()
    print()

    # EXTRACT URIs & THEIR ENTITY ID
    # -------------------------------------------------------------------------->

    print('Extracting entity IDs...')
    re_wikidata_uri = re.compile(r'<http://www\.wikidata\.org/entity/Q(\d+)>')
    with txt_file_reader(pathlib.Path(resources_dir) /
                         'yago-wd-sameAs.nt') as data:
        yago_eid = {}  # yago_uri => entity id (numerical string)
        for line in data:
            yago_uri, _, same_as, _ = line.strip().split('\t')
            match = re.fullmatch(re_wikidata_uri, same_as)
            if match:
                yago_eid[yago_uri] = match.group(1)
    with txt_file_writer(reports_dir / 'uri_eid.txt') as report:
        for yago_uri, eid in yago_eid.items():
            report.write('%s\t%s\n' % (yago_uri, eid))
    print('{:,} URIs extracted in {}'.format(len(yago_eid), timer.lap_time))
    print()

    # GENERATE CLASS IDs
    # -------------------------------------------------------------------------->

    print('Extracting class IDs...')
    with txt_file_reader(pathlib.Path(resources_dir) /
                         'yago-wd-schema.nt') as data:
        uri_cid = {}  # schema URI => class ID
        class_id = 1
        for line in data:
            class_uri, _, _, _ = line.strip().split('\t')
            uri = class_uri[1:-1]
            if uri not in uri_cid and uri.startswith('http'):
                uri_cid[uri] = class_id
                class_id += 1
    with txt_file_writer(reports_dir / 'uri_cid.txt') as report:
        for uri, cid in uri_cid.items():
            report.write('%s\t%d\n' % (uri, cid))
    print('Extracted {:,} classes in {}'.format(len(uri_cid), timer.lap_time))
    print()

    # EXTRACT INSTANCE_OF RELATIONS
    # -------------------------------------------------------------------------->

    print('Extracting types...')
    with txt_file_reader(
            pathlib.Path(resources_dir) / 'yago-wd-simple-types.nt') as data:
        eid_cids = {}  # entity id => list of class ids
        for line in data:
            yago_uri, _, instance_of, _ = line.strip().split('\t')
            eid = yago_eid.get(yago_uri, None)
            if eid is not None:
                append_to_dict_list(eid_cids, int(eid),
                                    uri_cid[instance_of[1:-1]])
    with txt_file_writer(reports_dir / 'eid_cids.txt') as report:
        for eid, types in eid_cids.items():
            report.write('%d\t%s\n' % (eid, str(types)))
    print('{:,} entities assigned in {}'.format(len(eid_cids), timer.lap_time))
    print()

    # ALL DONE
    # -------------------------------------------------------------------------->

    print('All done in {}'.format(timer.total_time))
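Each of the three readers above unpacks a line into exactly four tab-separated fields (subject, predicate, object, terminating dot). A hypothetical yago-wd-sameAs.nt line in that shape, with \t standing for a tab character and the URIs used only as placeholders, would be:

<http://yago-knowledge.org/resource/SomeEntity>\t<http://www.w3.org/2002/07/owl#sameAs>\t<http://www.wikidata.org/entity/Q123456>\t.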
Example #6
 def register_warning(self, token_idx, comment, focus_idxs):
     warning = {'comment': comment, 'focusIdxs': focus_idxs}
     append_to_dict_list(self.__warnings, token_idx, warning)
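For instance, after two hypothetical calls such as register_warning(3, 'ambiguous head', [3, 4]) and register_warning(3, 'missing determiner', [2, 3]), self.__warnings[3] would hold both warning dicts in insertion order; the arguments here are made up purely for illustration.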