def index(client, freq_file, lang):
    tweets = client['twitter_' + lang]['tweets']
    # freq_dict = load(freq_file)
    freq_dict = defaultdict(int)
    i = 0
    for tweet in tweets.find():
        i += 1
        if i % 100000 == 0:
            print i
        # tweets.update({'_id': tweet['_id']}, {'$set': {'indexed': True}})
        # text = tweet['text'].lower()
        text = tweet['text']
        text = re.sub(filter_pattern, '', text)
        for sent in split_multi(text):
            for word in word_tokenizer(sent):
                freq_dict[word] += 1

    # for the second db (tr)
    tweets = client['new_' + lang]['tweets']
    for tweet in tweets.find():
        i += 1
        if i % 100000 == 0:
            print i
        # tweets.update({'_id': tweet['_id']}, {'$set': {'indexed': True}})
        # text = tweet['text'].lower()
        text = tweet['text']
        text = re.sub(filter_pattern, '', text)
        for sent in split_multi(text):
            for word in word_tokenizer(sent):
                freq_dict[word] += 1

    save(freq_file, freq_dict)

def tokenize(text, segment=True, norm=True, unique=False, min_len=2, max_sent=0):
    ''' Tokenize text using SegTok segmenter and tokenizer. '''
    sentences = split_multi(text) if segment else [text]
    tokens = []
    for i, s in enumerate(sentences):
        if max_sent and i >= max_sent:
            break
        tokens += word_tokenizer(s)
    if unique:
        tokens = list(set(tokens))
    if min_len:
        tokens = [t for t in tokens if len(t) >= min_len]
    if norm:
        tokens = [w for t in tokens for w in normalize(t).split()]
    return tokens

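For orientation, the snippets in this listing mostly combine the same two segtok calls: split_multi for sentence segmentation and word_tokenizer for word tokenization. Below is a minimal sketch of that pattern; the sample text is invented, and only the two imports come from segtok itself.

# Minimal sketch of the segtok pattern used throughout these examples.
# Assumes segtok is installed; the sample text is illustrative only.
from segtok.segmenter import split_multi
from segtok.tokenizer import word_tokenizer

sample_text = "Dr. Smith arrived at the lab. The experiment started immediately."
for sentence in split_multi(sample_text):    # yields sentence strings
    print(word_tokenizer(sentence))          # prints a list of token strings per sentence
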
def tokenize_text(doc_text):
    '''Tokenize the text and preserve offsets'''
    # Split text into sentences using segtok, then words and create Token objects
    sents = [sent for sent in split_multi(doc_text) if sent.strip() != ""]
    doc_tokens = []
    current_offset = 0
    for sent in sents:
        sent_tokens = []
        words = re.split(SPLIT_REGEX, sent)
        words = [word.strip() for word in words if word.strip() != ""]
        for word in words:
            word_offset = doc_text.index(word, current_offset)
            current_offset = word_offset + len(word)
            word = unidecode(word)
            sent_token = Token(word, word_offset, word_offset + len(word), TOKEN_O)
            sent_tokens.append(sent_token)
        if sent_tokens:
            sent_start = sent_tokens[0].start
            sent_end = sent_tokens[-1].end
            # Update sentence offsets
            for token in sent_tokens:
                token.sent_start = sent_start
                token.sent_end = sent_end
            doc_tokens.append(sent_tokens)
    return doc_tokens

def __build_graph__(self):
    stopwords = get_stopwords(self.lan)
    stem = get_stem(self.lan).stem
    self.G = nx.Graph()
    sentences_str = [
        [w for w in split_contractions(web_tokenizer(s))
         if not (w.startswith("'") and len(w) > 1) and len(w) > 0]
        for s in list(split_multi(self.text)) if len(s.strip()) > 0
    ]
    for sentence in sentences_str:
        buffer = []
        for word in sentence:
            if len([c for c in word if c in EXCLUDE]) == len(word) \
                    or word.lower() in stopwords \
                    or word.replace('.', '').replace(',', '').replace('-', '').isnumeric():
                continue
            else:
                # stemmed_word = lemma(word).lower()
                stemmed_word = stem(word)
                if stemmed_word not in self.G:
                    self.G.add_node(stemmed_word, TF=0)
                self.G.node[stemmed_word]['TF'] += 1
                for (idx_cooccur, word_cooccur) in enumerate(buffer[-self.w:]):
                    self.__add_cooccur__(word_cooccur, stemmed_word, idx_cooccur + 1)
                buffer.append(stemmed_word)
    self.__build_linegraph__()

def single_extraction(sentences):
    with open('config/logging_config.yaml') as f:
        logging.config.dictConfig(yaml.load(f))
    logger = logging.getLogger('single_relation_extraction')

    parser_server = 'http://127.0.0.1:8084'

    for sent in split_multi(sentences):
        sent = sent.strip()
        if sent:
            logger.debug(u'SENTENCE: {}'.format(sent))
            try:
                extractor = RelationExtractor(sent, parser_server, logger, entity_linking_flag=False)
            except:
                logger.error(u'Failed to parse the sentence', exc_info=True)
            else:
                extractor.extract_spo()
                for relation in extractor.relations:
                    logger.debug(u'SUBJECT HEAD: {}'.format(relation.subject.head))
                    logger.debug(u'SUBJECT NN HEAD: {}'.format(relation.subject.nn_head))
                    if extractor.entity_linking_flag:
                        logger.debug(u'SUBJECT EL: {}'.format(relation.subject_el))
                    logger.debug(u'OBJECT HEAD: {}'.format(relation.object.head))
                    logger.debug(u'OBJECT NN HEAD: {}'.format(relation.object.nn_head))
                    if extractor.entity_linking_flag:
                        logger.debug(u'OBJECT EL: {}'.format(relation.object_el))
                    logger.debug(u'RELATION LEMMA: {}'.format(relation.lemma))
                    logger.debug(u'RELATION CANONICAL: {}'.format(relation.canonical_form))

def add_document(self, text):
    text = self.pre_filter(text)
    sentences_str = [
        [w for w in split_contractions(web_tokenizer(s))
         if not (w.startswith("'") and len(w) > 1) and len(w) > 0]
        for s in list(split_multi(text)) if len(s.strip()) > 0
    ]
    self.number_of_sentences += len(sentences_str)
    self.number_of_documents += 1
    pos_text = 0
    document_candidates = {}
    term_in_doc = {}
    sentences_obj = []
    block_of_word_obj = []
    sentence_obj_aux = []
    for (sentence_id, sentence) in enumerate(sentences_str):
        sentence_obj_aux = []
        block_of_word_obj = []
        for (pos_sent, word) in enumerate(sentence):
            if len([c for c in word if c in self.exclude]) == len(word):  # If the word is based on exclude chars
                if len(block_of_word_obj) > 0:
                    sentence_obj_aux.append(block_of_word_obj)
                    cand = ComposedWord(block_of_word_obj)
                    cand = self.add_or_update_composed_word(cand)
                    if cand.unique_kw not in document_candidates:
                        document_candidates[cand.unique_kw] = cand
                    block_of_word_obj = []
            else:
                tag = self.get_tag(word, pos_sent)
                term_obj = self.get_term(word)
                term_in_doc[term_obj.unique_term] = term_obj
                term_obj.add_occurrence(tag, sentence_id, pos_sent, pos_text, self.number_of_documents)
                pos_text += 1
                # Create co-occurrence matrix
                if tag not in self.tagsToDiscard:
                    word_windows = list(
                        range(max(0, len(block_of_word_obj) - self.windowsSize), len(block_of_word_obj)))
                    for w in word_windows:
                        if block_of_word_obj[w][0] not in self.tagsToDiscard:
                            self.add_cooccurrence(block_of_word_obj[w][2], term_obj)
                # Add term to the block of words' buffer
                block_of_word_obj.append((tag, word, term_obj))
        if len(block_of_word_obj) > 0:
            sentence_obj_aux.append(block_of_word_obj)
        if len(sentence_obj_aux) > 0:
            sentences_obj.append(sentence_obj_aux)
    if len(block_of_word_obj) > 0:
        sentence_obj_aux.append(block_of_word_obj)
    if len(sentence_obj_aux) > 0:
        sentences_obj.append(sentence_obj_aux)
    self.number_of_words += pos_text
    return document_candidates, term_in_doc

def parse(self):
    if self.error:
        return
    for count, sentence in enumerate(split_multi(self.result["text"])):
        self.result["sentences"][count] = {"string": sentence,
                                           "pos": [],
                                           "sentiment": [],
                                           "stanford": [],
                                           "count": count}
    if self.use_threads:
        self._threaded_parser()
    else:
        self._parser()
    if self.use_stats:
        self.stats_all()
    self.result["ners"] = []
    tmp_ners = []
    # TODO: integrate the results from pp
    for item in [self.result.get('sentences').get(s).get('stanford') for s in self.result.get('sentences')]:
        if item and item.get('ners'):
            for ner in item.get('ners'):
                if ner.get('tag') in ['person', 'per']:
                    self.result["ners"].append(ner.get('string'))

def extract_from_txt(input_dir):
    if not input_dir.endswith('/'):
        input_dir += '/'
    file_dir = input_dir + 'processed/'
    output_dir = input_dir + 'verbs/'
    if not os.path.exists(os.path.dirname(output_dir)):
        os.makedirs(os.path.dirname(output_dir))
    extractor = VerbExtractor()
    for root, _, files in os.walk(file_dir):
        for fn in files:
            if fn.endswith('.txt'):
                filename = os.path.join(root, fn)
                logging.info('Reading {}'.format(filename))
                f_in = codecs.open(filename, encoding='utf-8')
                text = f_in.read()
                results = {}
                for sent in split_multi(text):
                    verbs = extractor.extract(sent.lower())
                    if verbs:
                        for verb, category in verbs:
                            results.setdefault(category, []).append(verb)
                f_in.close()
                output_file = filename.replace('processed', 'verbs').replace('.txt', '_verbs.txt')
                logging.info('Writing to {}'.format(output_file))
                f_out = codecs.open(output_file, mode='w', encoding='utf-8')
                for category in extractor.bloom_verb_categories:
                    counter = Counter(results[category])
                    output_str = [u'{}({})'.format(item[0], str(item[1])) for item in counter.items()]
                    f_out.write(u'{}: {}\n'.format(category, ', '.join(output_str)))
                f_out.close()

def sentence_segmentation(text):
    '''gets a text (string), returns list of sentences.
    Every item of a list should be one and only one whole sentence.
    (But in fact there are errors.)

    As a 'sentence' we define a sequence of words carrying one whole thought.
    E.g. the text on this line before "E.g." was one sentence.
    Compound sentence is also 'one sentence'. 'One sentence' is also a title
    (beware, they are usually not ended by full stop) or a menu item.

    This function uses segmentation from `segtok` library, but it improves it
    by one additional rule. It tries to split also sentences that are not ended
    by dot and space, if the dot separates two existing English words present
    in dictionary.
    '''
    sentences = list(segmenter.split_multi(text))
    out_sentences = []
    for s in sentences:
        sen_words = s.split()
        current_sentence = []
        for w in sen_words:
            if not "http" in w and reg.match(w):
                a = re.split(r"(\)?[.!?:])\(?", w)
                if all(x.lower() in words or (x.lower().endswith("s") and x[:-1] in words) for x in a[:2:2]):
                    current_sentence.append(a[0] + a[1])
                    out_sentences.append(" ".join(current_sentence))
                    current_sentence = ["".join(a[2:])]
                else:
                    current_sentence.append(w)
            else:
                current_sentence.append(w)
        out_sentences.append(" ".join(current_sentence))
    return out_sentences

def split(self, text: str) -> List[Sentence]:
    plain_sentences: List[str] = list(split_multi(text))
    try:
        sentence_offset: Optional[int] = text.index(plain_sentences[0])
    except ValueError as error:
        raise AssertionError(
            f"Can't find the sentence offset for sentence {repr(plain_sentences[0])} "
            f"from the text's starting position") from error

    sentences: List[Sentence] = []
    for sentence, next_sentence in stagger(plain_sentences, offsets=(0, 1), longest=True):
        sentences.append(
            Sentence(text=sentence,
                     use_tokenizer=self._tokenizer,
                     start_position=sentence_offset))

        offset: int = sentence_offset + len(sentence)
        try:
            sentence_offset = text.index(next_sentence, offset) if next_sentence is not None else None
        except ValueError as error:
            raise AssertionError(
                f"Can't find the sentence offset for sentence {repr(sentence)} "
                f"starting from position {repr(offset)}") from error

    return sentences

def tokenize(self, text):
    """
    tokenize the text and filter the @username (and punctuation, smiley ...), leave only words
    """
    words = []  # list of words
    # text = text.decode('utf-8')
    text = filter_pattern.sub(' ', text)
    for sent in split_multi(text):
        for token in word_tokenizer(sent):
            words.append(token.encode('utf-8', 'ignore'))
    return words

def sentence_prepare(text, vectorizer, sent_len, doc_len, unique=False):
    # from nltk.tokenize import sent_tokenize
    from segtok.segmenter import split_multi
    vocab = vectorizer.vocabulary_
    tokenizer = vectorizer.build_tokenizer()
    # text = [sent_tokenize(doc) for doc in text]
    text = [list(split_multi(doc)) for doc in text]
    seq = []
    sent_l = []
    doc_l = []
    for doc in text:
        doc_tok = []
        for sent in doc:
            sent_toks = [vocab[y] for y in tokenizer(sent) if y in vocab]
            doc_tok.append(sent_toks)
            sent_l.append(len(sent_toks))
        seq.append(doc_tok)
        doc_l.append(len(doc_tok))
    sent_l = np.array(sent_l)
    doc_l = np.array(doc_l)
    print("Average Sent Length: ", sent_l.mean())
    print("90% Length: ", np.percentile(sent_l, 90))
    print("Average Doc Length: ", doc_l.mean())
    print("90% Length: ", np.percentile(doc_l, 90))
    # sent_len = np.percentile(sent_l, 90)
    # doc_len = np.percentile(doc_l, 90)
    padded_docs = torch.zeros(len(seq), doc_len, sent_len)
    for i, _doc in enumerate(seq):
        if len(_doc) > doc_len:
            _doc = _doc[:doc_len]
            padded_seq = pad_doc(_doc, sent_len, len(_doc))
        else:
            if len(_doc) == 0:
                continue
            padded_seq = pad_doc(_doc, sent_len, doc_len)
        padded_docs[i] = padded_seq
    return padded_docs

def tokenize_old(output_file, db='crawler'):
    client = MongoClient()
    texts = client[db]['texts']
    f = open(output_file, 'w')
    # (TODO: some query to get specific data)
    for entry in texts.find():
        text = entry['text'].decode('utf-8', 'ignore')
        # (optional: write article level data)
        for sent in split_multi(text):
            for token in word_tokenizer(sent):
                f.write('%s\t%s\n' % (token.encode('utf-8', 'ignore'), 'X'))
            f.write('\n')
    f.close()

def preprocessing(self):
    # split the text into sentences
    sentences = split_multi(self.text)
    # get the chunks of text
    self.Chuncks = self.chunks(sentences)
    self.chunckDict = []
    wordDict = dict()
    for chunck in self.Chuncks:
        wordDict = dict()
        for word in range(len(chunck)):
            # get the set of words
            # note: the bare access below only inserts the key if self.tokens is a defaultdict
            if chunck[word].lower() not in self.tokens:
                self.tokens[chunck[word].lower()]
            wordDict[chunck[word]] = self.getNameTag(chunck[word], word)
        self.chunckDict.append(wordDict)

def read(origin_file, freq_file, lang):
    freq_dict = defaultdict(int)
    i = 0
    for line in open(origin_file):
        i += 1
        if i % 100000 == 0:
            print i
        items = line.strip().split(',', 3)
        if len(items) == 4 and items[0] == lang:
            # text = items[3].lower().decode('utf-8')
            text = items[3].decode('utf-8')
            text = re.sub(filter_pattern, '', text)
            for sent in split_multi(text):
                for word in word_tokenizer(sent):
                    freq_dict[word] += 1
    save(freq_file, freq_dict)

def read_sentences_from_file(path_to_file, one_sentence_per_line=True):
    lines = []
    with io.open(path_to_file, mode="r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if line != "":
                lines.append(line.strip())
    if one_sentence_per_line:
        sentences = lines
    else:
        text = " ".join(lines)
        sentences = list(split_multi(text))
        sentences = [sentence for sentence in sentences if sentence != ""]
    return sentences

def tokenize_on_date(output_file, date='2015-07-05'):
    client = MongoClient()
    texts = client['crawler']['texts']
    f = open(output_file, 'w')
    # (TODO: some query to get specific data)
    for entry in texts.find({'date': date}):
        text = entry['text'].decode('utf-8', 'ignore')
        # (optional: write article level data)
        for sent in split_multi(text):
            for token in word_tokenizer(sent):
                if re.search("'s$", token):
                    f.write('%s\t%s\n' % (token[:-2].encode('utf-8', 'ignore'), 'X'))
                    f.write('%s\t%s\n' % (token[-2:].encode('utf-8', 'ignore'), 'X'))
                else:
                    f.write('%s\t%s\n' % (token.encode('utf-8', 'ignore'), 'X'))
            f.write('\n')
    f.close()

def tokenize(self, tweets):
    """
    tokenize the text and filter the @username (and punctuation, smiley ...), leave only words
    """
    counts = []  # [5, 12, 0, 3, ...] the counts of valid words for each tweet
    words = []   # list of words
    # out = ''   # one-word-per-line string of the tokenized words for morph analysis
    for (text, tid, uid) in tweets:
        i = 0
        text = filter_pattern.sub(' ', text)
        for sent in split_multi(text):
            for token in word_tokenizer(sent):
                # words.append(token.lower().encode('utf-8', 'ignore'))
                words.append(token.encode('utf-8', 'ignore'))
                i += 1
        counts.append(i)
    return words, counts

def tokenize_document(doc_path):
    '''Tokenize the text and preserve offsets'''
    with codecs.open(doc_path, 'r', "utf-8") as myfile:
        doc_text = myfile.read()
    # Split text into sentences using segtok, then words and create Token objects
    sents = [sent for sent in split_multi(doc_text) if sent.strip() != ""]
    doc_tokens = []
    current_offset = 0
    for sent in sents:
        sent_tokens = []
        words = re.split(SPLIT_REGEX, sent)
        words = [word.strip() for word in words if word.strip() != ""]
        for word in words:
            word_offset = doc_text.index(word, current_offset)
            current_offset = word_offset + len(word)
            sent_token = Token(word, word_offset, word_offset + len(word), TOKEN_O)
            sent_tokens.append(sent_token)
        doc_tokens.append(sent_tokens)
    return doc_tokens, doc_text

def __iter__(self):
    for root, _, files in os.walk(self._raw_text_dir):
        for fn in files:
            if fn.endswith('.txt'):
                filename = os.path.join(root, fn)
                f = codecs.open(filename, encoding='utf-8')
                text = f.read()
                if text:
                    text = self.process_text(text)
                    for sent in split_multi(text):
                        # Discard very long and very short sentences
                        if sent and len(sent) < 1000 and len(sent.split()) > 2:
                            sent = sent.strip()
                            yield filename, sent
                f.close()
                # Move the preprocessed files to a temp directory, so we know which files are done.
                done_filename = filename.replace('/raw/', '/raw_preprocessed/')
                if not os.path.exists(os.path.dirname(done_filename)):
                    os.makedirs(os.path.dirname(done_filename))
                os.rename(filename, done_filename)

def Features_computation(self):
    for word in self.tokens:
        Tfa = self.get_word_tags(word, "a")
        TfU = self.get_word_tags(word, "U")
        # compute TCase
        self.termsCalcule[word].TCase = max(Tfa, TfU) / math.log(1 + math.log(self.terms[word].TF))
        # TPos
        self.termsCalcule[word].TPos = math.log(3 + median(self.terms[word].offsets_sentences))
        # TFNorm
        validTFs = [self.terms[term].TF for term in self.tokens if not self.stopWord]
        avgTF = mean(validTFs)
        stdTF = stdev(validTFs)
        self.termsCalcule[word].TFNorm = self.terms[word].TF / (avgTF + stdTF)
        # TSent; split_multi yields sentences lazily, so materialize before taking len()
        self.termsCalcule[word].TSent = self.terms[word].offsets_sentences / len(list(split_multi(self.text)))
        # TRel
        maxTF = max([self.terms[term].TF for term in self.tokens])
        try:
            DL = self.calcule_DL(self.cooccur, self.cooccur(word))[1] / self.calcule_DL(self.cooccur, self.cooccur(word))[0]
        except:
            DL = 0
        try:
            DR = self.calcule_DR(self.cooccur, self.cooccur(word))[1] / self.calcule_DR(self.cooccur, self.cooccur(word))[0]
        except:
            DR = 0
        self.termsCalcule[word].TRel = 1 + (DL + DR) * (self.terms[word].TF / maxTF)

def clean_int(sents):
    introduction = []
    paragraph = []
    for sent in sents:
        sent = sent.lower()
        sent = re.sub('[~|\'|\"|``|\t|\n]', ' ', sent)
        sent = re.sub('{.*}', ' @cite', sent)
        sent = re.sub('\(.*\)', ' @remark', sent)
        sent = re.sub('\[.*\]', ' @cite', sent)
        sent = re.sub('e\s*\.g\s*\.\s*,', ' e.g., ', sent)
        sent = re.sub('e\s*\.g\s*\.\s*', ' e.g., ', sent)
        sent = re.sub('etc\s*\.', ' etc. ', sent)
        sent = re.sub('et\s*al\s*\.\s*,', ' et al., ', sent)
        sent = re.sub('i\s*\.e\s*\.\s*,', ' i.e., ', sent)
        sent = re.sub('[;|:]', '. ', sent)
        sent = re.sub(',[\s|,]*,', ', ', sent)
        sent = re.sub('\s*\.\s*', '. ', sent)
        tmp = split_multi(sent)
        res = []
        for each in tmp:
            if 'i.e' not in each and 'e.g' not in each:
                x = sent_tokenize(each)
                for y in x:
                    y = y.split()
                    if len(y) < 5:
                        continue
                    y = ' '.join(y)
                    res.append(y)
                continue
            each = each.split()
            if len(each) < 5:
                continue
            each = ' '.join(each)
            res.append(each)
        paragraph.append(len(res))
        introduction.extend(res)
    return introduction, paragraph

def compute_term_statistics(self):
    sentencesL = sent_tokenize(self.text)
    sentencesL = list(map(lambda sente: sente.lower(), sentencesL))
    sentences = split_multi(self.text)
    chuncks = self.chunks(sentences)
    for chunck in chuncks:
        for word in range(len(chunck)):
            if chunck[word] not in self.stopWord and len(chunck[word]) > 3:
                # compute the TF of the word
                self.terms[chunck[word]].TF += 1
                # compute the sum of positions of the sentences where the word occurs
                self.terms[chunck[word]].offsets_sentences = self.__getSumIndexSents(sentencesL, chunck[word])
                for j in range(self.window):
                    try:
                        if (chunck[word], chunck[j - word]) not in self.cooccur.keys():
                            self.cooccur[(chunck[word], chunck[j - word])] = 0
                        elif self.occurance(chunck, chunck[word], chunck[j - word]):
                            self.cooccur[(chunck[word], chunck[j - word])] += 1
                    except:
                        pass
                    try:
                        if (chunck[word], chunck[j + word]) not in self.cooccur.keys():
                            self.cooccur[(chunck[word], chunck[j + word])] = 0
                        elif self.occurance(chunck, chunck[word], chunck[j + word]):
                            self.cooccur[(chunck[word], chunck[j + word])] += 1
                    except:
                        pass

def split(self, text: str) -> List[Sentence]:
    sentences = []
    offset = 0

    plain_sentences = split_multi(text)
    for sentence in plain_sentences:
        sentence_offset = text.find(sentence, offset)

        if sentence_offset == -1:
            raise AssertionError(
                f"Can't find offset for sentences {plain_sentences} "
                f"starting from {offset}")

        sentences += [
            Sentence(text=sentence,
                     use_tokenizer=self._tokenizer,
                     start_position=sentence_offset)
        ]

        offset += len(sentence)

    return sentences

def get_plain_text(cleaned_html_node, summary_sentences_qty):
    """
    Summarizes text from html element.
    :param cleaned_html_node: html node to extract text sentences
    :param summary_sentences_qty: quantity of sentences of summarized text
    :return: summarized text, two-digit language code
    """
    clean_text = ""
    # assembling text only with complete sentences, ended with respective punctuations.
    for node in cleaned_html_node.iter('p'):
        if node.text is not None:
            for sentence in split_multi(node.text):
                if (len(sentence) > 0 and sentence[-1:] in ['.', '!', '?', '…']
                        and not sentence.strip(' .!?…').isdigit()
                        and not dialog_re.match(sentence)):
                    clean_text = clean_text + ' ' + sentence

    # creating summary, obtaining language code and total sentences quantity
    final_result, lang_code, sent_qty = create_referat(clean_text, '', summary_sentences_qty)
    return final_result, lang_code

def textrank(text, hdr):
    # find the most likely language of the text
    lang_code = lang_identifier.classify(' '.join([hdr, text]))[0]

    # tokenizing for words
    sentences = [sentence for sentence in split_multi(text)]

    stemmer = snowballstemmer.stemmer(LANG_CODES.get(lang_code, 'english'))
    words = [set(stemmer.stemWord(word) for word in word_tokenizer(sentence.lower()) if word.isalpha())
             for sentence in sentences]

    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
    scores = filter(lambda x: x[2], scores)

    g = nx.Graph()
    g.add_weighted_edges_from(scores)
    pr = nx.pagerank(g)

    return sorted(((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
                  key=lambda x: pr[x[0]], reverse=True), lang_code

def split(self, text: str) -> List[Sentence]:
    plain_sentences: List[str] = split_multi(text)
    sentence_offset = 0

    sentences: List[Sentence] = []
    for sentence in plain_sentences:
        try:
            sentence_offset = text.index(sentence, sentence_offset)
        except ValueError as error:
            raise AssertionError(
                f"Can't find the sentence offset for sentence {repr(sentence)} "
                f"starting from position {repr(sentence_offset)}"
            ) from error
        sentences.append(
            Sentence(
                text=sentence,
                use_tokenizer=self._tokenizer,
                start_position=sentence_offset,
            ))
        sentence_offset += len(sentence)

    return sentences

def segment(text):
    ''' Split text into sentences using SegTok segmenter. '''
    return split_multi(text)

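A usage note (not from the original sources): in the segtok versions these examples target, split_multi returns a lazy generator rather than a list, so a caller of the segment() wrapper above that needs len() or indexing should materialize the result first, for example:

# Hypothetical caller of the segment() wrapper above; list() materializes the
# generator returned by split_multi so len() and indexing work as expected.
sentences = list(segment("First sentence. Second sentence."))
print(len(sentences), sentences[0])
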
def test_multi_spans(self):
    self.assertSequenceEqual(SPAN_TEST_ANSWER, list(split_multi(SPAN_TEST_TEXT)))

def test_multiline(self):
    text = "This is a\nmultiline sentence. And this is Mr.\nAbbrevation."
    ml_sentences = ["This is a\nmultiline sentence.", "And this is Mr.\nAbbrevation."]
    self.assertSequenceEqual(ml_sentences, list(split_multi(text)))

def sentence_tokenizer(self, string):
    tokenized_sentences = list(split_multi(string))
    return tokenized_sentences

from segtok.segmenter import split_multi

text = open("abbrev._text", "r").read()
sentences = split_multi(text)

import results
results.print_results(sentences)

# for i in sentences:
#     print((i,))

def _build(self, text, windowsSize, n):
    text = self.pre_filter(text)
    self.sentences_str = [
        [w for w in split_contractions(web_tokenizer(s))
         if not (w.startswith("'") and len(w) > 1) and len(w) > 0]
        for s in list(split_multi(text)) if len(s.strip()) > 0
    ]
    self.number_of_sentences = len(self.sentences_str)
    pos_text = 0
    block_of_word_obj = []
    sentence_obj_aux = []
    for (sentence_id, sentence) in enumerate(self.sentences_str):
        sentence_obj_aux = []
        block_of_word_obj = []
        for (pos_sent, word) in enumerate(sentence):
            if len([c for c in word if c in self.exclude]) == len(word):  # If the word is based on exclude chars
                if len(block_of_word_obj) > 0:
                    sentence_obj_aux.append(block_of_word_obj)
                    block_of_word_obj = []
            else:
                tag = self.getTag(word, pos_sent)
                term_obj = self.getTerm(word)
                term_obj.addOccur(tag, sentence_id, pos_sent, pos_text)
                pos_text += 1

                # Create co-occurrence matrix
                if tag not in self.tagsToDiscard:
                    word_windows = list(
                        range(max(0, len(block_of_word_obj) - windowsSize), len(block_of_word_obj)))
                    for w in word_windows:
                        if block_of_word_obj[w][0] not in self.tagsToDiscard:
                            self.addCooccur(block_of_word_obj[w][2], term_obj)

                # Generate candidate keyphrase list
                candidate = [(tag, word, term_obj)]
                cand = composed_word(candidate)
                self.addOrUpdateComposedWord(cand)
                word_windows = list(
                    range(max(0, len(block_of_word_obj) - (n - 1)), len(block_of_word_obj)))[::-1]
                for w in word_windows:
                    candidate.append(block_of_word_obj[w])
                    self.freq_ns[len(candidate)] += 1.0
                    cand = composed_word(candidate[::-1])
                    self.addOrUpdateComposedWord(cand)

                # Add term to the block of words' buffer
                block_of_word_obj.append((tag, word, term_obj))

        if len(block_of_word_obj) > 0:
            sentence_obj_aux.append(block_of_word_obj)
        if len(sentence_obj_aux) > 0:
            self.sentences_obj.append(sentence_obj_aux)

    if len(block_of_word_obj) > 0:
        sentence_obj_aux.append(block_of_word_obj)
    if len(sentence_obj_aux) > 0:
        self.sentences_obj.append(sentence_obj_aux)

    self.number_of_words = pos_text
