def build_and_gather_multiple_arrays(self, save_path):
    print("🌋 Extracting mentions features with {} job(s)".format(self.n_jobs))
    parallel_process(self.docs, set_feats, n_jobs=self.n_jobs)

    print("🌋 Building and gathering arrays with {} job(s)".format(self.n_jobs))
    arr = [{'doc': doc, 'i': i} for i, doc in enumerate(self.docs)]
    arrays_dicts = parallel_process(arr, get_feats, use_kwargs=True, n_jobs=self.n_jobs)
    gathering_dict = dict((feat, None) for feat in FEATURES_NAMES)
    n_mentions_list = []
    pairs_ant_index = 0
    pairs_start_index = 0
    for npaidx in tqdm(range(len(arrays_dicts))):
        try:
            n, p, arrays_dict = arrays_dicts[npaidx]
        except (TypeError, ValueError):
            # Empty arrays dict: no values to unpack for this doc, skip it
            continue
        for f in FEATURES_NAMES:
            if gathering_dict[f] is None:
                gathering_dict[f] = arrays_dict[f]
            else:
                if f == FEATURES_NAMES[6]:
                    # Antecedent indices are doc-local: offset them by the
                    # number of pairs gathered so far
                    array = [a + pairs_ant_index for a in arrays_dict[f]]
                elif f == FEATURES_NAMES[3]:
                    # Same for the pair start indices
                    array = [a + pairs_start_index for a in arrays_dict[f]]
                else:
                    array = arrays_dict[f]
                gathering_dict[f] += array
        pairs_ant_index += n
        pairs_start_index += p
        n_mentions_list.append(n)

    # The first nine features are saved as numpy arrays
    for feature in FEATURES_NAMES[:9]:
        feature_data = gathering_dict[feature]
        if not feature_data:
            print("No data for", feature)
            continue
        print("Building numpy array for", feature, "length", len(feature_data))
        if feature != "mentions_spans":
            array = np.array(feature_data)
            if array.ndim == 1:
                array = np.expand_dims(array, axis=1)
        else:
            array = np.stack(feature_data)
        # check_numpy_array(feature, array, n_mentions_list)
        print("Saving numpy", feature, "size", array.shape)
        np.save(save_path + feature, array)

    # The remaining features are saved as pickles
    for feature in FEATURES_NAMES[9:]:
        feature_data = gathering_dict[feature]
        if feature_data:
            print("Saving pickle", feature, "size", len(feature_data))
            with open(save_path + feature + '.bin', "wb") as fp:
                pickle.dump(feature_data, fp)
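# Hypothetical usage sketch for the method above. `ConllCorpus` is assumed to
# be the class owning build_and_gather_multiple_arrays and to have parsed its
# documents first; the constructor arguments and paths are illustrative only.
def _demo_build_arrays():
    corpus = ConllCorpus(n_jobs=4)  # assumed constructor signature
    corpus.read_corpus("conll-2012/v4/data/train/")  # parse the CoNLL files
    # FEATURES_NAMES[:9] are written as .npy arrays and FEATURES_NAMES[9:]
    # as .bin pickles, all under the given path prefix.
    corpus.build_and_gather_multiple_arrays("numpy_arrays/")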
def extract_mentions_spans(doc, blacklist=True, debug=False):
    ''' Extract potential mentions from a spacy parsed Doc '''
    if debug:
        print('===== doc ====:', doc)
        # Dump one row per token to make the dependency parse easy to inspect
        tablines = []
        print("🚧 span search:")
        for c in doc:
            tablines.append(["🚧", c, c.head, c.tag_, c.pos_, c.dep_])
        print(tabulate(tablines, headers=["", "Token", "Head", "Tag", "Pos", "Dep"]))
    # Named entities with an accepted label are always candidate mentions
    mentions_spans = list(ent for ent in doc.ents if ent.label_ in ACCEPTED_ENTS)
    if debug:
        print("==-- ents:", [(ent, ent.label_) for ent in mentions_spans])
    # Add the candidates extracted from each sentence (in parallel)
    for spans in parallel_process(
            [{'doc': doc, 'span': sent, 'blacklist': blacklist} for sent in doc.sents],
            _extract_from_sent, use_kwargs=True, front_num=0):
        mentions_spans = mentions_spans + spans
    # Keep each (start, end) span once and drop empty spans
    spans_set = set()
    cleaned_mentions_spans = []
    for spans in mentions_spans:
        if spans.end > spans.start and (spans.start, spans.end) not in spans_set:
            cleaned_mentions_spans.append(spans)
            spans_set.add((spans.start, spans.end))
    return cleaned_mentions_spans
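# Minimal sketch of calling extract_mentions_spans directly, assuming an
# English spaCy model is installed; the sentence is illustrative only.
def _demo_extract_mentions():
    import spacy
    nlp = spacy.load("en_core_web_sm")
    doc = nlp("My sister has a dog. She loves him.")
    # Returns deduplicated candidate mention Spans: accepted named entities
    # plus the sentence-level candidates found by _extract_from_sent.
    for span in extract_mentions_spans(doc, blacklist=True):
        print(span.start, span.end, span.text)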
def build_key_file(self, data_path, key_file, debug=False):
    print("🌋 Building key file from corpus")
    print("Saving in", key_file)
    with io.open(key_file, "w", encoding="utf-8") as kf:
        if debug:
            print("Key file saved in", key_file)
        for dirpath, _, filenames in os.walk(data_path):
            print("In", dirpath)
            file_list = [
                os.path.join(dirpath, f)
                for f in filenames
                if f.endswith(".v4_auto_conll") or f.endswith(".v4_gold_conll")
            ]
            cleaned_file_list = []
            for f in file_list:
                fn = f.split(".")
                if fn[1] == "v4_auto_conll":
                    # Keep the auto annotation only when no gold version exists
                    gold = fn[0] + "." + "v4_gold_conll"
                    if gold not in file_list:
                        cleaned_file_list.append(f)
                else:
                    cleaned_file_list.append(f)
            # Read the files in a pool of processes. By default, one process
            # is created for each CPU on the machine.
            doc_list = parallel_process(cleaned_file_list, read_file)
            for doc in doc_list:
                kf.write(doc)
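# Usage sketch, assuming build_key_file lives on the same corpus class as
# above; the paths are illustrative. The key file concatenates every kept
# *_conll document so it can be fed to the official CoNLL scorer.
def _demo_build_key_file():
    corpus = ConllCorpus()  # assumed constructor
    corpus.build_key_file("conll-2012/v4/data/test/", "key.txt")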
def read_corpus(self, data_path, model=None, debug=False):
    print("🌋 Reading files")
    for dirpath, _, filenames in os.walk(data_path):
        print("In", dirpath, os.path.abspath(dirpath))
        file_list = [
            os.path.join(dirpath, f)
            for f in filenames
            if f.endswith(".v4_auto_conll") or f.endswith(".v4_gold_conll")
        ]
        cleaned_file_list = []
        for f in file_list:
            fn = f.split(".")
            if fn[1] == "v4_auto_conll":
                # Keep the auto annotation only when no gold version exists
                gold = fn[0] + "." + "v4_gold_conll"
                if gold not in file_list:
                    cleaned_file_list.append(f)
            else:
                cleaned_file_list.append(f)
        doc_list = parallel_process(cleaned_file_list, load_file)
        for docs in doc_list:
            for (
                utts_text,
                utt_tokens,
                utts_corefs,
                utts_speakers,
                name,
                part,
            ) in docs:
                if debug:
                    print("Imported", name)
                    print("utts_text", utts_text)
                    print("utt_tokens", utt_tokens)
                    print("utts_corefs", utts_corefs)
                    print("utts_speakers", utts_speakers)
                    print("name, part", name, part)
                self.utts_text += utts_text
                self.utts_tokens += utt_tokens
                self.utts_corefs += utts_corefs
                self.utts_speakers += utts_speakers
                self.utts_doc_idx += [len(self.docs_names)] * len(utts_text)
                self.docs_names.append((name, part))
    print("utts_text size", len(self.utts_text))
    print("utts_tokens size", len(self.utts_tokens))
    print("utts_corefs size", len(self.utts_corefs))
    print("utts_speakers size", len(self.utts_speakers))
    print("utts_doc_idx size", len(self.utts_doc_idx))

    print("🌋 Building docs")
    for name, part in self.docs_names:
        self.docs.append(
            ConllDoc(
                name=name,
                part=part,
                nlp=None,
                blacklist=self.blacklist,
                consider_speakers=True,
                embedding_extractor=self.embed_extractor,
                conll=CONLL_GENRES[name[:2]],
            )
        )

    print("🌋 Loading spacy model")
    if model is None:
        # Try the largest available English model first
        model_options = ["en_core_web_lg", "en_core_web_md", "en_core_web_sm", "en"]
        for model_option in model_options:
            try:
                spacy.info(model_option)  # raises if the model is not installed
                model = model_option
                print("Loading model", model_option)
                break
            except Exception:
                print("Could not detect model", model_option)
        if not model:
            print("Could not detect any suitable English model")
            return
    else:
        spacy.info(model)
        print("Loading model", model)
    nlp = spacy.load(model)

    print("🌋 Parsing utterances and filling docs with use_gold_mentions="
          + str(bool(self.gold_mentions)))
    doc_iter = (s for s in self.utts_text)
    for utt_tuple in tqdm(
        zip(
            nlp.pipe(doc_iter),
            self.utts_tokens,
            self.utts_corefs,
            self.utts_speakers,
            self.utts_doc_idx,
        )
    ):
        spacy_tokens, conll_tokens, corefs, speaker, doc_id = utt_tuple
        if debug:
            print(unicode_(self.docs_names[doc_id]), "-", spacy_tokens)
        doc = spacy_tokens
        if debug:
            out_str = ("utterance " + unicode_(doc) + " corefs " + unicode_(corefs)
                       + " speaker " + unicode_(speaker) + " doc_id " + unicode_(doc_id))
            print(out_str.encode("utf-8"))
        self.docs[doc_id].add_conll_utterance(
            doc, conll_tokens, corefs, speaker, use_gold_mentions=self.gold_mentions)
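# Standalone sketch of the model-selection fallback used in read_corpus above:
# try the larger English models first and fall back to smaller ones.
# spacy.info raises for models that are not installed, which is what the
# try/except relies on.
def _pick_english_model():
    import spacy
    for name in ("en_core_web_lg", "en_core_web_md", "en_core_web_sm", "en"):
        try:
            spacy.info(name)  # raises if the package is unavailable
            return name
        except Exception:
            continue
    return None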