def __iter__(self):
    # Note: only the first annotation file is processed (file_list[0:1]).
    for filename in self.file_list[0:1]:
        sent_file = os.path.join(self.annotation_dir, filename)
        with open(sent_file) as file:
            lc = LoopTimer(update_after=100)
            lastid = None
            for line in file:
                if self.print_status:
                    lc.update("Posbigram Sent Stream")
                data = json.loads(line)
                xml = data['annotation']
                doc_id = data['id']  # renamed from `id` to avoid shadowing the builtin
                # Number each line consecutively within its document.
                if lastid != doc_id:
                    para_num = 0
                else:
                    para_num += 1
                lastid = doc_id
                token_list = mf.xml2words(xml)
                pos_list = mf.xml2pos(xml)
                for tokens, pos in zip(token_list, pos_list):
                    token_cleaned, pos_cleaned = utils.posFilterString(tokens, pos)
                    if len(token_cleaned) > 0:
                        # Emit POS bigrams, keyed by document id and position.
                        yield doc_id, para_num, utils.makeBigrams(pos_cleaned)
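# For context: the streams in this section read line-delimited JSON. A record
# might look like the sketch below; the exact XML schema inside 'annotation'
# is whatever the mf.xml2* helpers expect, so the payload here is an
# illustration, not the project's actual format.
import json

record = {
    "id": "doc_0001",                            # document identifier
    "paragraphID": 0,                            # consumed by the paragraph stream below
    "annotation": "<sentences>...</sentences>",  # XML payload parsed by mf.xml2*
}
line = json.dumps(record)  # annotation files hold one such record per line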
def __iter__(self):
    # Note: only the first annotation file is processed (file_list[0:1]).
    for filename in self.file_list[0:1]:
        sent_file = os.path.join(self.annotation_dir, filename)
        with open(sent_file) as file:
            lc = LoopTimer(update_after=100)
            abs_list = []
            lastid = None
            for line in file:
                if self.print_status:
                    lc.update("Lemma Doc Stream")
                data = json.loads(line)
                doc_id = data['id']
                xml = data['annotation']
                # A new document id means the previous document is complete.
                if lastid != doc_id and len(abs_list) > 0:
                    yield lastid, abs_list
                    abs_list = []
                lastid = doc_id
                token_list = mf.xml2lemmas(xml)
                pos_list = mf.xml2pos(xml)
                for tokens, pos in zip(token_list, pos_list):
                    token_cleaned, pos_cleaned = utils.posFilterString(tokens, pos)
                    if len(token_cleaned) > 0:
                        abs_list.extend(token_cleaned)
            # Flush the last document of the file; reset so tokens cannot
            # leak across files if more than one were processed.
            if len(abs_list) > 0:
                yield lastid, abs_list
                abs_list = []
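# A document stream like the one above can feed gensim directly, since gensim
# accepts any iterable of token lists. Minimal usage sketch; `LemmaDocStream`
# and its constructor arguments are assumed names, as the owning class is not
# shown in this excerpt.
from gensim import corpora

stream = LemmaDocStream(annotation_dir="annotations", file_list=files, print_status=False)
dictionary = corpora.Dictionary(tokens for doc_id, tokens in stream)    # first pass
bow_corpus = [dictionary.doc2bow(tokens) for doc_id, tokens in stream]  # second pass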
def __iter__(self):
    # Note: only the first annotation file is processed (file_list[0:1]).
    for filename in self.file_list[0:1]:
        sent_file = os.path.join(self.annotation_dir, filename)
        with open(sent_file) as file:
            lc = LoopTimer(update_after=100)
            for line in file:
                if self.print_status:
                    lc.update("Lemma Para Stream")
                data = json.loads(line)
                doc_id = data['id']
                para_id = data['paragraphID']
                xml = data['annotation']
                token_list = mf.xml2lemmas(xml)
                pos_list = mf.xml2pos(xml)
                para_list = []
                for tokens, pos in zip(token_list, pos_list):
                    token_cleaned, pos_cleaned = utils.posFilterString(tokens, pos)
                    if len(token_cleaned) > 0:
                        para_list.extend(token_cleaned)
                # One JSON line per paragraph: emit its cleaned lemmas.
                yield doc_id, para_id, para_list
def nlp_to_doc_token(annotation, token_type, clean=True, lower=False, bigrams=False, dictionary=None):
    """Flatten an annotation ({'sentences': [{'tokens': [...]}]}) into one token list."""
    sentences = annotation['sentences']
    abs_list = list()
    for sentence in sentences:
        pos_list = list()
        token_list = list()
        for token in sentence['tokens']:
            pos_list.append(token['pos'])
            if lower:
                token_list.append(token[token_type].lower())
            else:
                token_list.append(token[token_type])
        if clean:
            # Keep only tokens whose POS tags pass the filter.
            token_list, pos_cleaned = utils.posFilterString(token_list, pos_list)
        if dictionary is not None:
            # Drop tokens unknown to the (gensim-style) dictionary.
            token_list = [word for word in token_list if word in dictionary.token2id]
        if bigrams:
            token_list = utils.makeBigrams(token_list)
        abs_list.extend(token_list)
    return abs_list
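# Minimal usage sketch for nlp_to_doc_token with an inline annotation in the
# CoreNLP JSON shape the function expects; real annotations come from the
# files consumed above. Cleaning and bigrams are disabled so the call does
# not depend on the utils helpers.
annotation = {
    "sentences": [
        {"tokens": [
            {"word": "Graphs", "lemma": "graph", "pos": "NNS"},
            {"word": "are", "lemma": "be", "pos": "VBP"},
            {"word": "useful", "lemma": "useful", "pos": "JJ"},
        ]}
    ]
}

tokens = nlp_to_doc_token(annotation, token_type="lemma", clean=False, lower=True)
print(tokens)  # ['graph', 'be', 'useful']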