def chunkScript(movie, max_len=300):
    """Add a `[SEP]` token at the end of each sentence in the script."""
    outscript = ['']
    # movie = re.sub('\s+', ' ', movie)
    prev_len = 0
    try:
        segmentation = list(segmenter.process(movie))
        for paragraph in segmentation:
            for s in paragraph:
                sent = ' '.join([t.value for t in s]) + ' [SEP]'
                if prev_len + len(s) < max_len - 2:
                    # sentence still fits into the current chunk
                    outscript[-1] += sent
                    prev_len += len(s)
                elif len(s) < max_len - 2:
                    # sentence fits into a fresh chunk
                    outscript.append(sent)
                    prev_len = len(s)
                else:
                    # new sentence is longer than max_len: split it further
                    # print(len(outscript))
                    outscript += [
                        ' '.join(__extractValueFromToken__(s2))
                        for s2 in chunkset(s, max_len - 1)
                    ]
                    # print(len(outscript))
                    prev_len = len(outscript[-1])
    except Exception:
        print("segmentation error for movie:", movie)
        return []
    return outscript
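# Minimal usage sketch for chunkScript (illustration only, not part of the
# original module). It assumes the module-level `segmenter` is syntok's
# segmenter; the `chunkset` / `__extractValueFromToken__` helpers are only
# needed for sentences longer than max_len, which this short sample avoids.
if __name__ == "__main__":
    import syntok.segmenter as segmenter

    sample_script = "He enters quietly. She looks up from the desk and smiles."
    for chunk in chunkScript(sample_script, max_len=50):
        print(chunk)  # sentences inside each chunk are terminated with [SEP]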
def main():
    parser = argparse.ArgumentParser(description='Preprocess corpus')
    parser.add_argument('--config_file', type=str,
                        default='configs/echoes_local.config',
                        help='location of the configuration file')
    args = parser.parse_args()

    config = configparser.ConfigParser()
    config.read(args.config_file)

    orig_dir = config['general']['orig_dir']
    logging.info(f'Preprocessing corpus files under: {orig_dir}')
    corpus_file = config['general']['corpus_file']
    max_files = int(config['general']['max_files'])

    fns = glob.glob(f'{orig_dir}/*.txt')
    if max_files > -1:
        fns = list(fns)[:max_files]

    metadata = pd.read_excel(config['general']['metadata'])
    metadata.fillna('', inplace=True)
    metadata.set_index('filepath', inplace=True)

    with open(corpus_file, 'w') as corpusf:
        for fn in tqdm(fns):
            with open(fn, encoding='utf-8-sig') as f:
                # try to extract metadata:
                md = {'author': 'Unknown', 'title': 'Unknown'}
                try:
                    r = metadata.loc[os.path.basename(fn).replace('.txt', '')]
                    if r['author:lastname']:
                        md['author'] = r['author:lastname']
                    if r['title:detail']:
                        md['title'] = r['title:detail']
                except KeyError:
                    pass

                # extract sentences:
                sentences = []
                for paragraph in segmenter.process(f.read()):
                    for sentence in paragraph:
                        # get original sentence:
                        string_repr = ''
                        for token in sentence:
                            string_repr += f'{token.spacing}{token.value}'
                        string_repr = WHITESPACE.sub(' ', string_repr).strip()

                        # get individual tokens:
                        tokens = [t.value.strip() for t in sentence]
                        tokens = [t for t in tokens if t]

                        if tokens and string_repr:
                            sentences.append({'sentence': string_repr, 'tokens': tokens})

                if sentences:
                    sentences = {'metadata': md, 'sentences': sentences}
                    corpusf.write(json.dumps(sentences) + '\n')

    logging.info(f'Finished tokenizing corpus to: {corpus_file}')
def tokenize(text, lemma_stop=False):
    sentenceC = []
    for parg in segmenter.process(text):
        for sentence in parg:
            sentenceC.append(' '.join([token.value for token in sentence]))
    return sentenceC
def segment_abstract(text_stream, segout_name):
    segmented_outfile = open("pubmed_seg_txts/" + segout_name, "a+", encoding='utf-8')
    for paragraph in segmenter.process(text_stream):
        for sentence in paragraph:
            for token in sentence:
                segmented_outfile.write(token.value + ' ')
            segmented_outfile.write('\n')
        print()
    segmented_outfile.close()
def test_process(self):
    for paragraph in segmenter.process(DOCUMENT):
        offset = 0
        for sentence in paragraph:
            for token in sentence:
                if token.value and token.value != "tincidunt":  # tin-cidunt linebreak!
                    new_offset = DOCUMENT.find(token.value, offset)
                    self.assertNotEqual(new_offset, -1, repr(token) + " at %d" % offset)
                    offset = new_offset + len(token.value)
def segment(text: str) -> List[List[ConllToken]]:
    sentences = []
    for paragraph in segmenter.process(text):
        for sentence in paragraph:
            sent: List[ConllToken] = []
            sentences.append(sent)
            for i, token in enumerate(sentence):
                sent.append(ConllToken(space_prefix=token.spacing,
                                       value=token.value,
                                       offset=token.offset))
                if i > 0:
                    sent[-2].space_after = sent[-1].spacing == ' '
    return sentences
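# ConllToken is not defined in this snippet. A minimal stand-in (an assumption,
# not the project's actual class) that is sufficient to run segment() above
# could look like this; it would have to appear before segment() so the
# return-type annotation resolves:
from dataclasses import dataclass
from typing import List  # segment()'s annotations also need this


@dataclass
class ConllToken:
    space_prefix: str
    value: str
    offset: int
    space_after: bool = True

    @property
    def spacing(self) -> str:
        # segment() reads `.spacing` back, so mirror the constructor argument
        return self.space_prefix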
def process_file(input_file, clean_string=False, verbose=False):
    """Read an input (html) file from disk and process to paragraph-size chunks."""
    # Create labels from the file path
    labels = '+'.join(str(input_file).split("/")[2:4])
    # Open input file
    with input_file.open() as f:
        file_as_string = f.read()
    # Process html
    paragraphs_from_html = '\n\n'.join(html2paragraph(file_as_string))
    paragraphs = segmenter.process(paragraphs_from_html)
    paragraphs_new = []
    for k, paragraph in enumerate(paragraphs):
        paragraph_new = []
        for sentence in paragraph:
            sentence_new = []
            for token in sentence:
                sentence_new.append(token.spacing + token.value)
            # Join
            sentence_new_joined = "".join(sentence_new)
            # Clean
            if clean_string:
                # Remove citations
                sentence_new_joined = re.sub(r'\[[0-9]{1,3}\]', '', sentence_new_joined)
                # Remove anything between round brackets
                sentence_new_joined = re.sub(r'\(.*\)', '', sentence_new_joined)
                # Remove anything between square brackets
                sentence_new_joined = re.sub(r'\[.*\]', '', sentence_new_joined)
            if not sentence_new_joined.endswith("Wikipedia") and \
                    len(sentence_new_joined.split(" ")) > MIN_SENTENCE_TOKENS:
                paragraph_new.append(sentence_new_joined)
        # Keep the paragraph only if it has more than MIN_PARAGRAPH_TOKENS tokens
        if len(" ".join(paragraph_new).split(" ")) > MIN_PARAGRAPH_TOKENS:
            paragraphs_new.append("".join(paragraph_new))
    # Print
    if verbose and len(paragraphs_new) > 0:
        print("\t[==>] Snippets: [{}]\n\t[==>] Sample: {}\n\t[==>] Length: {}\n"
              .format(len(paragraphs_new), paragraphs_new[0],
                      len(paragraphs_new[0].split(" "))))
    # Return
    return (paragraphs_new, labels)
def fasttext_iter_all_documents(input):
    import syntok.segmenter as segmenter

    for article in input.glob('./*/*/*'):
        label = '__label__' + '-'.join(
            tokenize('-'.join(str(article).split('/')[1:3])))
        print('* handling {} for {}'.format(article, label))
        with article.open() as f:
            string = f.read()
        paragraphs = '\n\n'.join(html2paragraph(string))
        paragraphs = segmenter.process(paragraphs)
        for paragraph in paragraphs:
            for sentence in paragraph:
                sentence = ' '.join(token.value for token in sentence)
                yield label, sentence
def fix_sentences(text):
    out = ''
    for paragraph in segmenter.process(text):
        for sentence in paragraph:
            for token in sentence:
                # roughly reproduce the input,
                # except for hyphenated word-breaks
                # and replacing "n't" contractions with "not",
                # separating tokens by single spaces
                out += token.value + ' '
            out += '\n'  # one sentence per line
        out += '\n'  # separate paragraphs with a blank line
    return out
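# Quick usage sketch (illustrative text only; assumes the module does
# `import syntok.segmenter as segmenter`). The result has one sentence per
# line and a blank line between paragraphs:
if __name__ == "__main__":
    sample = "First sentence. Second sentence.\n\nA new paragraph starts here."
    print(fix_sentences(sample))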
def fasttext_estimate(input):
    import fastText as ft
    import syntok.segmenter as segmenter

    model = ft.load_model(input)
    string = sys.stdin.read()
    paragraphs = '\n\n'.join(html2paragraph(string))
    paragraphs = segmenter.process(paragraphs)
    counter = Counter()
    for paragraph in paragraphs:
        for sentence in paragraph:
            sentence = ' '.join(token.value for token in sentence)
            out = model.predict(sentence, 73)  # number of labels
            out = {out[0][i]: out[1][i] for i in range(73)}
            counter = counter + Counter(out)
    for key, value in counter.most_common(10):
        print('{}\t\t{}'.format(key, value))
def sentence_segmenter(document):
    sentences = []
    for paragraph in segmenter.process(document):
        for sentence in paragraph:
            s_sentence = ""
            for token in sentence:
                # roughly reproduce the input,
                # except for hyphenated word-breaks
                # and replacing "n't" contractions with "not",
                # separating tokens by single spaces
                # print(token.value, end=' ')
                s_sentence += token.value + " "
            # print()  # print one sentence per line
            sentences.append(s_sentence)
        # print()  # separate paragraphs with newlines
    return "\n".join(sentences)
def tokenize_input_doc(input_doc, token_lower=True, token_lower_digit=False, max_sentences=15):
    # Sentences
    sent_out = list()
    a = segmenter.process(input_doc)
    # For each paragraph, do ...
    for par in a:
        # For each sentence in the paragraph, do ...
        for sent in par:
            csent = "".join([token.spacing + token.value for token in sent]).strip()
            text_clean = tokenize_text(csent, token_lower, token_lower_digit)
            if text_clean is None:
                continue
            sent_out.append(text_clean)
    # Vectorize each input document
    docs_vectorized = tokenizer.texts_to_sequences(sent_out)
    return docs_vectorized
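# tokenize_input_doc depends on two names defined elsewhere in the original
# project: a `tokenize_text` cleaning helper and a fitted, module-level
# `tokenizer` exposing `texts_to_sequences`. The latter matches the Keras
# Tokenizer API; a plausible setup (an assumption, not the original code)
# would be:
#
#     from tensorflow.keras.preprocessing.text import Tokenizer
#     tokenizer = Tokenizer(num_words=20000)
#     tokenizer.fit_on_texts(training_sentences)  # fit on the real corpus first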
def find_organisations_reasons(folder: str):
    """
    Go through files in the given folder, extract organisation names
    and their reason for appearance in each file.
    """
    org_reasons, org_counts = {}, {}
    file_count = 0
    try:
        # Get flair models.
        ner_tagger, frame_tagger, pos_tagger = get_flair_taggers()

        # Fetch results from cache, if present.
        files_processed, org_reasons, org_counts = check_cache()
        file_count = 1 if len(files_processed) == 0 else len(files_processed) + 1

        # Find files to process from path.
        files = glob.glob(f"{folder}/*.txt")
        print(f"Processing {len(files)} files in '{folder}'.")

        # Remove previously processed file names.
        to_process = [f for f in files if f not in files_processed]
        for path in to_process:
            print(f"[{file_count}/{len(files)}] Processing {path}...")
            with open(path, "r") as file:
                text = file.read()

            # Go through paragraphs sentence by sentence and extract information.
            paragraphs = process(text)
            for sentences_tokenized in paragraphs:
                for tokens in sentences_tokenized:
                    sentence = ""
                    for token in tokens:
                        sentence += f"{token.spacing}{token.value}"
                    sentence = Sentence(sentence.strip())

                    # Add NER, POS and Semantic Frame Detection tags to the sentence.
                    ner_tagger.predict(sentence)
                    frame_tagger.predict(sentence)
                    pos_tagger.predict(sentence)

                    # Extract all organisations.
                    organisations = get_organisations(sentence)
                    if not organisations:
                        continue

                    # Find the first organisation occurrence and its reason for appearance.
                    for first in organisations[:1]:
                        name = clean_organization(first.text)
                        reason = get_reason_for_appearance(first, sentence)
                        add_to_organisation(name, reason, org_counts, org_reasons)

                    # Count the remaining organisations, but don't look up their reason
                    # for appearance: organisations after the first one rarely have
                    # meaningful reasons, which leads to broken sentences.
                    for remaining in organisations[1:]:
                        name = clean_organization(remaining.text)
                        add_to_organisation(name, None, org_counts, org_reasons)

            files_processed.append(path)
            # Store in cache after processing.
            dump_to_cache(files_processed, org_reasons, org_counts)
            file_count += 1

        # Drop pronouns that were wrongly tagged as organisations.
        if org_reasons.get('I'):
            org_reasons.pop('I', None)
            org_counts.pop('I', None)
        if org_reasons.get('We'):
            org_reasons.pop('We', None)
            org_counts.pop('We', None)

        print(f"\nFinished processing {file_count} files.")
        return org_reasons, org_counts
    except (KeyboardInterrupt, Exception) as e:
        # Handle early exit by the user (CTRL+C) and other errors.
        print(e)
        print("\n\nExiting...")
        print(f"Finished processing {file_count} files.")
        return org_reasons, org_counts
def extract_formatted_sentences(document):
    sentences = []
    for paragraph in segmenter.process(document):
        for sentence in paragraph:
            sentences.append(" ".join([token.value for token in sentence]))
    return sentences
def extract_paragraphs(document):
    return [paragraph for paragraph in segmenter.process(document)]
def handle_page_paragraphs(page_num, page_text, doc_dict):
    for paragraph_num, paragraph in enumerate(segmenter.process(page_text)):
        paragraph_text = get_paragraph_text(paragraph)
        paragraph_dict = create_paragraph_dict(page_num, paragraph_num,
                                               paragraph_text, doc_dict)
        doc_dict["paragraphs"].append(paragraph_dict)
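# get_paragraph_text() is defined elsewhere in the original project. A minimal
# sketch of what such a helper might do (a hypothetical stand-in, not the real
# implementation) is to re-join the segmented tokens of every sentence in the
# paragraph:
def get_paragraph_text_sketch(paragraph):
    # each paragraph from syntok is a list of sentences,
    # and each sentence is a list of tokens with .spacing and .value
    return "".join(
        token.spacing + token.value
        for sentence in paragraph
        for token in sentence
    ).strip()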
def _segment_text_into_sentences(raw_sentence: str):
    sentences = []
    for segmented_sentences in segmenter.process(raw_sentence):
        for sentence in segmented_sentences:
            sentences.append("".join(map(str, sentence)).lstrip())
    return sentences
def _segment_text_into_sentences(article: str):
    sentences = []
    for paragraph in segmenter.process(article):
        for sentence in paragraph:
            sentences.append("".join(map(str, sentence)).lstrip())
    return sentences
# Fragment from the body of a larger loop; `lnsp` is presumably the current
# corpus line, already split into fields.
if lnsp[0] != previous_docnr:
    if sentcount < 15:
        # Take doc number
        docnr = lnsp[0]
        # Add to inputs
        inputs[lbl0][docnr] = sent_level
    # Reset
    sent_level = list()
    sentcount = 0
    docfinish = False
    # Set previous doc number to current
    previous_docnr = lnsp[0]

# Process each sentence of the paragraph, unless we already have enough sentences
if docfinish:
    continue

# Else, use syntok to segment a paragraph into sentences
a = segmenter.process(lnsp[-1])
# For each paragraph, do ...
for par in a:
    # For each sentence in the paragraph, do ...
    for sent in par:
        csent = "".join([token.spacing + token.value for token in sent]).strip()
        # Tokenize text
        txt_tok = tokenize_text(csent,
                                lower_tokens=args.token_lower,
                                remove_digits_token=args.token_remove_digits)
        # If None, pass ...
        if txt_tok is None:
            failed.append(csent)
            continue
def extract_sentences(document):
    sentences = []
    for paragraph in segmenter.process(document):
        for sentence in paragraph:
            sentences.append(sentence)
    return sentences