def extract_mentions_spans(doc, blacklist=True, debug=False):
    '''
    Extract potential mentions from a spacy parsed Doc
    '''
    if debug: print('===== doc ====:', doc)
    for c in doc:
        if debug: print("🚧 span search:", c, "head:", c.head,
                        "tag:", c.tag_, "pos:", c.pos_, "dep:", c.dep_)
    # Named entities
    mentions_spans = list(ent for ent in doc.ents if ent.label_ in ACCEPTED_ENTS)

    if debug: print("==-- ents:", list((ent, ent.label_) for ent in mentions_spans))

    # Extract additional candidate spans from each sentence in parallel
    for spans in parallel_process([{'doc': doc,
                                    'span': sent,
                                    'blacklist': blacklist} for sent in doc.sents],
                                  _extract_from_sent, use_kwargs=True, front_num=0):
        mentions_spans = mentions_spans + spans

    # Keep only non-empty spans and deduplicate on (start, end) boundaries
    spans_set = set()
    cleaned_mentions_spans = []
    for spans in mentions_spans:
        if spans.end > spans.start and (spans.start, spans.end) not in spans_set:
            cleaned_mentions_spans.append(spans)
            spans_set.add((spans.start, spans.end))

    return cleaned_mentions_spans
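
# Hypothetical usage sketch (not part of the original module): running the
# extractor on a spaCy-parsed document. Assumes a spaCy English model is
# installed and that ACCEPTED_ENTS, parallel_process and _extract_from_sent
# are defined elsewhere in this module.
#
#     import spacy
#     nlp = spacy.load('en_core_web_sm')
#     doc = nlp(u"My sister has a dog. She loves him.")
#     for span in extract_mentions_spans(doc, blacklist=True):
#         print(span.text, span.start, span.end)
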
def build_and_gather_multiple_arrays(self, save_path):
    print("🌋 Extracting mentions features")
    parallel_process(self.docs, set_feats, n_jobs=self.n_jobs)

    print("🌋 Building and gathering arrays")
    arr = [{'doc': doc, 'i': i} for i, doc in enumerate(self.docs)]
    arrays_dicts = parallel_process(arr, get_feats, use_kwargs=True, n_jobs=self.n_jobs)
    gathering_dict = dict((feat, None) for feat in FEATURES_NAMES)
    n_mentions_list = []
    pairs_ant_index = 0
    pairs_start_index = 0
    # Concatenate the per-document feature arrays, shifting pair indices so
    # they stay valid in the gathered (corpus-level) arrays
    for n, p, arrays_dict in tqdm(arrays_dicts):
        for f in FEATURES_NAMES:
            if gathering_dict[f] is None:
                gathering_dict[f] = arrays_dict[f]
            else:
                if f == FEATURES_NAMES[6]:
                    array = [a + pairs_ant_index for a in arrays_dict[f]]
                elif f == FEATURES_NAMES[3]:
                    array = [a + pairs_start_index for a in arrays_dict[f]]
                else:
                    array = arrays_dict[f]
                gathering_dict[f] += array
        pairs_ant_index += n
        pairs_start_index += p
        n_mentions_list.append(n)

    for feature in FEATURES_NAMES[:9]:
        print("Building numpy array for", feature, "length", len(gathering_dict[feature]))
        if feature != "mentions_spans":
            array = np.array(gathering_dict[feature])
            if array.ndim == 1:
                array = np.expand_dims(array, axis=1)
        else:
            array = np.stack(gathering_dict[feature])
        # check_numpy_array(feature, array, n_mentions_list)
        print("Saving numpy", feature, "size", array.shape)
        np.save(save_path + feature, array)
    for feature in FEATURES_NAMES[9:]:
        print("Saving pickle", feature, "size", len(gathering_dict[feature]))
        with open(save_path + feature + '.bin', "wb") as fp:
            pickle.dump(gathering_dict[feature], fp)
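
# Note on the index shifting above (illustrative sketch; the numbers below are
# made-up examples, not taken from the corpus): when per-document arrays are
# concatenated, pair antecedent indices must be shifted by the number of
# mentions gathered so far (pairs_ant_index), and pair start indices by the
# number of pairs gathered so far (pairs_start_index).
#
#     # doc A: 2 mentions, 1 pair     doc B: 3 mentions, 3 pairs
#     doc_b_ant_local = [0, 1, 1]
#     mentions_seen = 2                       # contributed by doc A
#     doc_b_ant_global = [a + mentions_seen for a in doc_b_ant_local]
#     print(doc_b_ant_global)                 # -> [2, 3, 3]
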
def build_key_file(self, data_path, key_file, debug=False):
    print("🌋 Building key file from corpus")
    print("Saving in", key_file)
    with io.open(key_file, "w", encoding='utf-8') as kf:
        if debug: print("Key file saved in", key_file)
        for dirpath, _, filenames in os.walk(data_path):
            print("In", dirpath)
            file_list = [os.path.join(dirpath, f) for f in filenames
                         if f.endswith(".v4_auto_conll") or f.endswith(".v4_gold_conll")]
            # Prefer gold annotations: keep an auto_conll file only if no
            # gold_conll counterpart exists
            cleaned_file_list = []
            for f in file_list:
                fn = f.split('.')
                if fn[1] == "v4_auto_conll":
                    gold = fn[0] + "." + "v4_gold_conll"
                    if gold not in file_list:
                        cleaned_file_list.append(f)
                else:
                    cleaned_file_list.append(f)
            #self.load_file(file_list[0])
            # Create a pool of processes. By default, one is created for each CPU in your machine.
            doc_list = parallel_process(cleaned_file_list, read_file)
            for doc in doc_list:
                kf.write(doc)
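
# Illustrative sketch of the gold/auto selection rule used above (file names
# below are invented for the example; the real code splits on '.' rather than
# using endswith, which behaves the same as long as the path holds one dot):
#
#     file_list = ["cctv_0000.v4_auto_conll",
#                  "cctv_0000.v4_gold_conll",
#                  "cctv_0001.v4_auto_conll"]
#     kept = [f for f in file_list
#             if not (f.endswith(".v4_auto_conll")
#                     and f.replace("auto", "gold") in file_list)]
#     print(kept)  # -> ['cctv_0000.v4_gold_conll', 'cctv_0001.v4_auto_conll']
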
def read_corpus(self, data_path, debug=False):
    print("🌋 Reading files")
    for dirpath, _, filenames in os.walk(data_path):
        print("In", dirpath, os.path.abspath(dirpath))
        file_list = [os.path.join(dirpath, f) for f in filenames
                     if f.endswith(".v4_auto_conll") or f.endswith(".v4_gold_conll")]
        # Prefer gold annotations: keep an auto_conll file only if no
        # gold_conll counterpart exists
        cleaned_file_list = []
        for f in file_list:
            fn = f.split('.')
            if fn[1] == "v4_auto_conll":
                gold = fn[0] + "." + "v4_gold_conll"
                if gold not in file_list:
                    cleaned_file_list.append(f)
            else:
                cleaned_file_list.append(f)
        doc_list = parallel_process(cleaned_file_list, load_file)
        for docs in doc_list:  # executor.map(self.load_file, cleaned_file_list):
            for utts_text, utt_tokens, utts_corefs, utts_speakers, name, part in docs:
                print("Imported", name)
                if debug:
                    print("utts_text", utts_text)
                    print("utt_tokens", utt_tokens)
                    print("utts_corefs", utts_corefs)
                    print("utts_speakers", utts_speakers)
                    print("name, part", name, part)
                self.utts_text += utts_text
                self.utts_tokens += utt_tokens
                self.utts_corefs += utts_corefs
                self.utts_speakers += utts_speakers
                self.utts_doc_idx += [len(self.docs_names)] * len(utts_text)
                self.docs_names.append((name, part))
    print("utts_text size", len(self.utts_text))
    print("utts_tokens size", len(self.utts_tokens))
    print("utts_corefs size", len(self.utts_corefs))
    print("utts_speakers size", len(self.utts_speakers))
    print("utts_doc_idx size", len(self.utts_doc_idx))

    print("🌋 Building docs")
    for name, part in self.docs_names:
        self.docs.append(ConllDoc(name=name, part=part, nlp=None,
                                  use_no_coref_list=False, consider_speakers=True,
                                  embedding_extractor=self.embed_extractor,
                                  conll=CONLL_GENRES[name[:2]]))

    print("🌋 Loading spacy model")
    try:
        spacy.info('en_core_web_sm')
        model = 'en_core_web_sm'
    except IOError:
        print("No spacy 2 model detected, using spacy1 'en' model")
        spacy.info('en')
        model = 'en'
    nlp = spacy.load(model)

    print("🌋 Parsing utterances and filling docs")
    doc_iter = (s for s in self.utts_text)
    for utt_tuple in tqdm(zip(nlp.pipe(doc_iter),
                              self.utts_tokens, self.utts_corefs,
                              self.utts_speakers, self.utts_doc_idx)):
        spacy_tokens, conll_tokens, corefs, speaker, doc_id = utt_tuple
        if debug: print(unicode_(self.docs_names[doc_id]), "-", spacy_tokens)
        doc = spacy_tokens
        if debug:
            out_str = "utterance " + unicode_(doc) + " corefs " + unicode_(corefs) + \
                      " speaker " + unicode_(speaker) + " doc_id " + unicode_(doc_id)
            print(out_str.encode('utf-8'))
        self.docs[doc_id].add_conll_utterance(doc, conll_tokens, corefs, speaker,
                                              use_gold_mentions=self.use_gold_mentions)
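
# Hypothetical end-to-end usage of the three methods above (the corpus class
# name `ConllCorpus`, its constructor arguments and the paths are assumptions
# made for illustration; they are not confirmed by this excerpt):
#
#     corpus = ConllCorpus(n_jobs=4)
#     corpus.read_corpus("/path/to/conll-2012/train/")
#     corpus.build_key_file("/path/to/conll-2012/train/", "/path/to/key.txt")
#     corpus.build_and_gather_multiple_arrays("/path/to/numpy_arrays/")
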