def tokenize_sentences(domain_concept, text):
    """Split ``text`` into sentences and return the kept tokens of each one.

    Every call first flags a fixed set of extra strings as stop words on
    ``nlp.vocab``. Each sentence produced by ``textcleaner.split_sentences``
    is then lower-cased, run through ``nlp`` and filtered: a token is dropped
    when its text is a bare newline, when it is a stop word, punctuation or
    number-like, or when its text is at most one character long.

    Args:
        domain_concept: Label used only in the warning logged when ``text``
            is a float; it is concatenated with a ``str``.
        text: Text to tokenize. A ``float`` is treated as "no summary"
            (presumably a NaN placeholder for a missing value -- confirm
            against the caller).

    Returns:
        A list holding one list of stripped token strings per sentence, or
        an empty list when ``text`` is a float.
    """
    # The stop-word flags are set on every call, before the float check.
    for extra_stop_word in ['say', '\'s', 'be', 'says', 'including', 'said', 'named', '\t', 'know', '\n\n', 'Des', ' ', '']:
        nlp.vocab[extra_stop_word].is_stop = True

    if isinstance(text, float):
        logging.warning(domain_concept + ": does not have summary")
        return []

    # Parse every sentence first, then filter, mirroring the two passes.
    parsed = [nlp(raw.lower()) for raw in textcleaner.split_sentences(text)]
    return [
        [
            token.text.strip()
            for token in doc
            if not (
                token.text == '\n'
                or token.is_stop
                or token.is_punct
                or token.like_num
                or len(token.text) <= 1
            )
        ]
        for doc in parsed
    ]
def edit_article(article):
    """Summarize (and, if needed, translate) an article record, then save it.

    The original text is summarized with ``summarize``. When the article's
    original language is not ``EN`` the original title and the summary are
    translated with ``translate_this``; otherwise the existing title is kept.
    On success the title, an HTML summary (one ``<p>`` per sentence), the
    keywords and the ``READY`` status are stored and the record is saved.

    Args:
        article: Record exposing ``original_title``, ``original_text``,
            ``original_language``, ``title`` and ``save()``.

    Returns:
        ``None`` when translation raised, otherwise ``article`` (left
        unsaved if the summary ended up being ``None``).
    """
    update_log.info('Editing {}'.format(article.original_title))
    summary = summarize(article.original_text)

    if article.original_language == EN:
        title = article.title
    else:
        translate_langs = "{}-{}".format(article.original_language, EN)
        try:
            title = translate_this(article.original_title, translate_langs)
            summary = translate_this(summary, translate_langs)
        except Exception as err:
            # Any translation failure aborts the edit; the error is only logged.
            update_log.error(err)
            return None

    if summary is None:
        update_log.error('Could not finished editing the article.')
        return article

    article.title = title
    paragraphs = [
        "<p>{}</p>".format(sent)
        for sent in textcleaner.split_sentences(summary)
    ]
    article.summary = "".join(paragraphs)
    article.keywords = gn_keywords(summary).replace("\n", ", ")
    article.status = READY
    article.save()
    update_log.info('Editing finished successfully!')
    return article
def summarize(text, word_count=256):
    """Summarize ``text`` with the gensim TextRank summarizer.

    Reference implementation:
    https://github.com/anmolgulati/gensim/blob/df238ef1bc71568819ba92502f0e9df46b933698/gensim/summarization/summarizer.py

    According to the original note, that summarizer emits a warning when the
    built corpus has 3 words or fewer and when there are 10 sentences or
    fewer. Short inputs are therefore returned untouched here: when the
    number of distinct whitespace-separated words is below ``word_count``,
    or when fewer than 2 sentences are found.

    Args:
        text: The text to summarize.
        word_count: Target summary length passed to the summarizer; also the
            minimum number of distinct words required before summarizing.

    Returns:
        The summary with newlines replaced by spaces, or ``text`` itself
        when it is too short or the summarizer returns an empty result.
    """
    min_distinct_words = word_count
    min_sentences = 2

    # Too few distinct words: nothing to summarize.
    if len(set(text.split())) < min_distinct_words:
        return text

    # Too few sentences: nothing to summarize.
    if len(split_sentences(text)) < min_sentences:
        return text

    condensed = textrank_summarizer(text, word_count=word_count)
    condensed = re.sub('\n', ' ', condensed)
    return text if len(condensed) == 0 else condensed
def summary(x, perc):
    """Return a summary of document ``x``, or ``x`` itself when it is short.

    Args:
        x: Input document.
        perc: Percentage of the original document to keep; forwarded to
            ``summarize`` as ``ratio``.

    Returns:
        ``x`` unchanged when it has 10 sentences or fewer. Otherwise the
        sentences returned by ``summarize`` are passed through ``f``,
        converted with ``str`` and joined with newlines.
    """
    if len(split_sentences(x)) <= 10:
        return x
    selected = summarize(x, ratio=perc, split=True)
    return '\n'.join(map(str, f(selected)))
def create_paragraphs(article_text, paragraph_size):
    """Re-flow ``article_text`` into HTML paragraphs of grouped sentences.

    The text is split into sentences, the sentences are grouped with
    ``group_sentences`` and each group is joined with single spaces and
    terminated by ``<BR/><BR/>``.

    Args:
        article_text: The text to re-flow.
        paragraph_size: Group size handed to ``group_sentences``
            (presumably the number of sentences per paragraph -- confirm
            against ``group_sentences``).

    Returns:
        The concatenated paragraphs, or an empty string when there are no
        groups.
    """
    sentence_list = split_sentences(article_text)
    # The group size was previously hard-coded to 4, silently ignoring the
    # paragraph_size argument; honour the caller's value instead.
    sentence_groups = group_sentences(sentence_list, paragraph_size)
    return "".join(
        " ".join(group) + "<BR/><BR/>" for group in sentence_groups
    )
def readCorpus(fname, tokens_only=False, mode='w'):
    """Read ``fname`` line by line and return one processed entry per line.

    Stop words are removed from every line with ``remove_stopwords``. The
    cleaned line is then split into sentences when ``mode`` is ``'s'``, and
    tokenized with ``gensim.utils.simple_preprocess`` for any other mode.

    Args:
        fname: Path opened through ``smart_open`` with ``iso-8859-1``
            encoding.
        tokens_only: Accepted but not used by this function.
        mode: ``'s'`` selects sentence splitting; anything else selects
            word tokenization.

    Returns:
        A list with one result per line of the file.
    """
    sentence_mode = (mode == 's')
    with smart_open.smart_open(fname, encoding="iso-8859-1") as handle:
        if sentence_mode:
            return [split_sentences(remove_stopwords(line)) for line in handle]
        # Train text with or without tags
        return [
            gensim.utils.simple_preprocess(remove_stopwords(line))
            for line in handle
        ]
def clean_data_to_format(directory, partition, part):
    """Write the documents and summaries of a partition as ``.src``/``.tgt``.

    Each text returned by ``select_partition`` is split into a document and
    a summary with ``split_doc``. Both are split into sentences, re-joined
    with a sentence marker, wrapped in double quotes and terminated with
    ``' \\n'``. All documents are saved through ``save_texts`` under
    ``part + '.src'`` and all summaries under ``part + '.tgt'``.

    Args:
        directory: Passed to ``select_partition`` and ``save_texts``.
        partition: Passed to ``select_partition``.
        part: Passed to ``select_partition``; also the base name of the two
            output files.
    """
    print('Begin reading of data')
    _, texts = select_partition(directory, partition, part)

    print('Begin preprocessing of data')
    # Collect the lines and join once at the end: repeated ``+=`` on a
    # growing string is quadratic in the size of the partition.
    document_lines = []
    summary_lines = []
    for text in texts:
        document, summary = split_doc(text)
        original_document = split_sentences(document)
        original_summary = split_sentences(summary)
        # NOTE(review): the summary marker has four leading '#' while the
        # document marker has three. Kept byte-for-byte because downstream
        # consumers may rely on it -- confirm whether this is a typo.
        document_lines.append(
            '\"' + ' ###SENT### '.join(original_document) + '\" \n')
        summary_lines.append(
            '\"' + ' ####SENT### '.join(original_summary) + '\" \n')

    print('Saving data')
    save_texts(directory, part + '.src', [''.join(document_lines)], [''])
    save_texts(directory, part + '.tgt', [''.join(summary_lines)], [''])
    print('Saved data')
def summarize(self, msgs, range_spec=None):
    """Return a summary of the text.

    Messages are keyed by their canonicalized text and only the 300 longest
    texts (by word count) are kept. The "simple" summary is the tagged form
    of the three longest messages. When there are at least 11 messages and
    at least 11 distinct texts, the gensim summarizer (``gs_sumrz``) is run
    on the joined texts and on the joined sentences instead; the simple
    summary is used as a fallback whenever that path cannot produce output.

    TODO:
        1. Looks like spacy is not getting the main sentence from the message.
        2. Load times for the spacy summarizer won't cut it. Commenting out
           now until this can be fixed

    Args:
        msgs: Messages to summarize; each one is passed to ``get_msg_text``
            and ``self.tagged_sum``.
        range_spec: Optional mapping. ``'txt'`` is the heading put in front
            of the summary (default ``u'Summary is'``) and ``'size'`` the
            number of summary sentences requested (default 3).

    Returns:
        The heading followed by the summary, or a fixed "unable to form
        summary" string when ``msgs`` is empty.
    """
    if not msgs:
        self.logger.warn("No messages to form summary")
        return u"\n Unable to form summary here.\n"

    def word_count(entry):
        return len(entry.split())

    # A range_spec without 'txt' used to raise KeyError; fall back to the
    # default heading the same way 'size' falls back to its default.
    heading = (range_spec['txt']
               if range_spec and 'txt' in range_spec else u'Summary is')
    size = range_spec['size'] if range_spec and 'size' in range_spec else 3
    summ = heading + u' '

    # Limit the canonical dictionary to the 300 longest texts.
    can_dict = {canonicalize(get_msg_text(msg)): msg for msg in msgs}
    top_keys = sorted(can_dict.keys(), key=word_count, reverse=True)[:300]
    can_dict = {key: can_dict[key] for key in top_keys}
    self.logger.info("Length of can_dict is %s", len(can_dict))
    longest = sorted(can_dict.keys(), key=word_count, reverse=True)[:3]
    simple_sum = u'\n'.join([self.tagged_sum(can_dict[ss]) for ss in longest])

    # If the number of messages or vocabulary is too low, just look for a
    # promising set of messages
    if len(msgs) < 11 or len(can_dict) < 11:
        # return the longest
        self.logger.warn("Too few messages for NLP.")
        summ += simple_sum
    else:
        max_sents = {}
        for (text, msg) in can_dict.items():
            if word_count(text) > 3:
                # Use the same splitting that gensim does
                for snt in split_sentences(text):
                    words = snt.split()
                    if len(words) > 100:
                        snt = u' '.join(words[:100])
                    max_sents[snt] = msg

        if not max_sents:
            # Every text had 3 words or fewer: the ratio below would divide
            # by zero, so use the simple summary instead.
            self.logger.warn("No sentences long enough for NLP.")
            summ += simple_sum
        else:
            ratio = (size * 2) / float(len(max_sents))
            sent1 = u' '.join(can_dict.keys())
            sent2 = u' '.join(max_sents.keys())
            gn_sum = gs_sumrz(sent1, ratio=ratio, split=True)[:size]
            mx_sum = gs_sumrz(sent2, ratio=ratio, split=True)[:size]
            self.logger.info("Gensim sum %s", gn_sum)
            gs_summ = u'\n'.join([
                self.tagged_sum(
                    can_dict[ss] if ss in can_dict else max_sents[ss])
                for ss in gn_sum
                if len(ss) > 1 and (ss in max_sents or ss in can_dict)
            ])
            # Summary sentences that match no key exactly are looked up by
            # substring containment in either direction.
            for ss in mx_sum:
                if (ss not in max_sents and ss not in can_dict
                        and len(ss.split()) > 5):
                    self.logger.info("Searching for: %s", ss)
                    for (ky, msg) in max_sents.items():
                        if ss in ky or (len(ky.split()) > 10 and ky in ss):
                            gs_summ += u'\n' + self.tagged_sum(msg)
            if len(gn_sum) > 1:
                summ += gs_summ
            else:
                self.logger.warn(
                    "NLP Summarizer produced null output %s", gs_summ)
                summ += simple_sum

    self.logger.info("Summary for segment %s is %s", msgs, summ)
    return summ
def mk_bigrams():
    """Train a bigram ``Phraser`` on the judgments dump and apply it.

    Reads ``dump_base + "judgments"`` as UTF-8, splits it into sentences,
    tokenizes each sentence with ``gensim.utils.simple_tokenize``, fits
    ``Phrases`` on the token lists and saves the resulting ``Phraser`` to
    ``dump_base + "bigramer"``.

    Returns:
        A list with the phraser applied to each tokenized sentence.
    """
    with open(dump_base + "judgments", 'r', encoding="utf-8") as handle:
        raw_text = handle.read()

    tokenized = []
    for sentence in textcleaner.split_sentences(raw_text):
        tokenized.append(list(gensim.utils.simple_tokenize(sentence)))

    phraser = Phraser(Phrases(tokenized))
    phraser.save(dump_base + "bigramer")

    transformed = []
    for tokens in tokenized:
        transformed.append(phraser[tokens])
    return transformed
def function_summarize():
    """Summarize the ``text`` form field of the current request.

    Returns:
        A JSON error body with status 400 when the text has fewer than 5
        sentences; otherwise a JSON object whose ``'key'`` entry holds the
        result of ``summarize``. The summary is also printed.
    """
    text = request.form['text']

    if len(split_sentences(text)) < 5:
        error_body = {
            "ERROR": "Not enough sentences found. There must be at least 5 sentences for summary."
        }
        return jsonify(error_body), 400

    processed_text = summarize(text)
    print(processed_text)
    return jsonify({'key': processed_text})
def summarize_text(self, text: str):
    """Summarize ``text``, falling back to the text itself.

    ``summarize`` is called with ``self.model_params`` only when the text
    has more than one sentence; a ``ValueError`` from it falls back to the
    unmodified text.

    Returns:
        The prediction, or the literal ``'none'`` when it is empty.
    """
    pred: str = text
    if len(split_sentences(text)) > 1:
        try:
            pred = summarize(text, **self.model_params)
        except ValueError:
            pred = text
    return pred if pred else 'none'
def split_and_preprocess(text: str, token_filters: List[Callable]) -> List[str]:
    """Split ``text`` into sentences and pair each with its filtered form.

    Args:
        text: The text to split.
        token_filters: Filters handed to ``preprocess_string`` for every
            sentence.

    Returns:
        Whatever ``merge_syntactic_units`` builds from the original
        sentences and their space-joined, filtered counterparts.
    """
    # Step 1: raw sentences.
    original_sentences = split_sentences(text)

    # Step 2: the filtered tokens of each sentence, re-joined with spaces.
    filtered_sentences = [
        " ".join(preprocess_string(raw, filters=token_filters))
        for raw in original_sentences
    ]

    return merge_syntactic_units(original_sentences, filtered_sentences)
def textrank_summarize(corpus):
    """Summarize every multi-article sample of ``corpus`` with ``summarize``.

    A sample is a string whose articles are separated by
    ``story_separator_special_tag``. The articles are joined with newlines
    and summarized with a 500-word budget. An empty summary falls back to
    all sentences of the sample. Every selected sentence is then attributed
    to each article that contains it; per article the sentences are joined
    with ``" newline_char "`` and the articles are joined with
    ``" story_separator_special_tag "``.

    When ``summarize`` raises ``ValueError`` the stripped sample itself is
    stored as its summary.

    Returns:
        A tuple ``(summaries, error_count, empty_summary_count)``.
    """
    print("Begin summarizing...")
    list_of_summarization = []
    error_counter = 0
    null_summarization_counter = 0

    for i in range(len(corpus)):
        sample = corpus[i].strip()
        articles = sample.split("story_separator_special_tag")
        joined_articles = "\n".join(articles)

        try:
            summarization = summarize(joined_articles, word_count=500, split=True)
            if len(summarization) == 0:
                null_summarization_counter += 1
                summarization = split_sentences(joined_articles)
                if len(summarization) == 0:
                    print("*** No Summarization ***", i)
        except ValueError:
            print("ValueError, sample", sample)
            list_of_summarization.append(sample)
            error_counter += 1
            continue

        # One bucket of selected sentences per article.
        per_article = [[] for _ in articles]
        for sent in summarization:
            matched = False
            for position, article in enumerate(articles):
                if sent in article:
                    per_article[position].append(sent)
                    matched = True
            if not matched:
                print(i, "****", sent, (sent in " ".join(articles)))

        joined_buckets = [" newline_char ".join(bucket) for bucket in per_article]
        list_of_summarization.append(
            " story_separator_special_tag ".join(joined_buckets))

        if i % 100 == 0:
            print(i)
            print("------")

    return list_of_summarization, error_counter, null_summarization_counter
def processFile(sample):
    """Turn the text ``sample`` into a list of stemmed sentence objects.

    The text (with a trailing newline appended) is split into sentences.
    Each sentence is stripped, lower-cased, word-tokenized with NLTK and
    stemmed with the Porter stemmer. Sentences that produce no tokens are
    skipped.

    Args:
        sample: The raw text to process.

    Returns:
        A list of ``sentence.sentence(stemmed_words, original_sentence)``
        objects, where the original sentence is the unstripped split result.
    """
    stemmer = nltk.PorterStemmer()
    sentence_objects = []

    for original_words in split_sentences(sample + "\n"):
        words = nltk.word_tokenize(original_words.strip().lower())
        stemmed = [stemmer.stem(word) for word in words]
        if stemmed:
            sentence_objects.append(sentence.sentence(stemmed, original_words))

    return sentence_objects
def __init__(self, filename, tokenize=True):
    """Load ``filename`` and prepare its lower-cased sentence list.

    Args:
        filename: Path of the text file to read; stored on the instance.
        tokenize: When true, ``self.tokenizeSentences()`` is called after
            the sentence list has been built.
    """
    self.filename = filename
    with open(filename, 'r') as source:
        self.text = source.read()

    # -- Convert strange utf-8 bytes into punctuations -- #
    self.replaceStrangeChrs()

    # -- Preprocessing: generate a list of sentences -- #
    lowered_sentences = []
    for raw_sentence in split_sentences(self.text):
        lowered_sentences.append(raw_sentence.lower())
    self.listOfSentences = lowered_sentences

    # -- Optional: Perform word tokenization -- #
    if tokenize:
        self.tokenizeSentences()
def conversion(training_data_csv, vector_csv):
    """Append one averaged sentence vector per row to ``vector_csv``.

    Rows with missing values are dropped in place. For every remaining row
    the ``text`` field is split into sentences; in each sentence every run
    of non-letters becomes a space, the sentence is lower-cased, its
    whitespace is collapsed, and it is tokenized and passed to ``encode``.
    The encodings of a row are averaged along axis 0 and the string form of
    the result is appended to ``vector_csv`` as a single headerless cell.

    Args:
        training_data_csv: DataFrame with a ``text`` column; modified in
            place by ``dropna``.
        vector_csv: Destination handed to ``DataFrame.to_csv`` in append
            mode.
    """
    training_data_csv.dropna(inplace=True)

    for _, row in training_data_csv.iterrows():
        encoded_sentences = []
        for raw_sentence in split_sentences(row.text):
            letters_only = re.sub("[^A-Za-z]+", ' ', str(raw_sentence)).lower()
            normalized = re.sub(r'\s+', ' ', str(letters_only))
            encoded_sentences.append(encode(word_tokenize(normalized)))

        mean_vector = np.average(np.array(encoded_sentences), axis=0)
        output_row = DataFrame([[str(mean_vector)]], columns=['vector'])
        output_row.to_csv(vector_csv, index=False, header=False, mode='a')
def mymemory_translate(text, languages="el-en"):
    """Translate ``text`` through the MyMemory API within a daily word quota.

    The remaining quota is kept in the ``mymemory_words_remaining`` cache
    item, which is (re)created with 1000 words and a one-day expiration
    when it is missing or expired. Texts longer than 500 characters are
    sent sentence by sentence and the translated sentences are joined with
    ``"\\r\\n"``.

    Args:
        text: The text to translate.
        languages: Language pair written as ``"src-dst"``; sent to the API
            as ``"src|dst"``.

    Returns:
        The translated text. ``None`` when the quota does not cover the
        text or when a sentence of a long text could not be translated. For
        a short text the result of the single request is returned, which is
        ``None`` when the API answered with a non-200 status.
    """
    daily_limit = 1000
    c = cache.get_item('mymemory_words_remaining')
    if c is None or c.is_expired():
        c = cache.set_item('mymemory_words_remaining', daily_limit)
        c.set_expiration_date(timezone.now() + relativedelta(days=+1))

    url = "http://api.mymemory.translated.net/get"
    langpair = languages.replace("-", "|")
    words_to_send = len(re.findall(r'\w+', text))
    words_remaining = int(c.value)
    print('==> Words to send:')
    print(words_to_send)

    def translate(sentence):
        """Send one request; return the translation or None on failure."""
        params = {"q": sentence, "langpair": langpair}
        session = get_tor_session()
        response_object = session.post(url, params)
        response = json.loads(response_object.text)
        if response['responseStatus'] != 200:
            update_log.warning('MyMemory responded with {}'.format(
                response['responseStatus']))
            return None
        return response['responseData']['translatedText']

    if words_remaining <= words_to_send:
        update_log.warning('MyMemory reached the daily limit.')
        return None

    # The limit of characters for each request is 500
    if len(text) > 500:
        translated_parts = []
        for sent in textcleaner.split_sentences(text):
            translated = translate(sent)
            # Test the translation result, not the input sentence: the old
            # check looked at ``sent`` and so never caught a failed request,
            # which then raised TypeError on ``None + str``.
            if translated is None:
                return None
            translated_parts.append(translated + "\r\n")
        translated_text = "".join(translated_parts)
    else:
        translated_text = translate(text)

    # NOTE(review): the quota is charged even when the single-request path
    # returned None, but not when a long text is aborted above -- confirm
    # which accounting is intended.
    words_remaining -= int(words_to_send)
    c.set_value(words_remaining)
    return translated_text
def tokenize_sentences(domain_concept, text):
    """Split ``text`` into sentences and return the lemmatized tokens of each.

    Each sentence from ``textcleaner.split_sentences`` is lower-cased and
    run through ``nlp``. A token is kept only when it is not a stop word,
    punctuation, number-like or URL-like, its text contains neither a
    newline nor a space, and its text is longer than one character. Kept
    tokens are stripped and passed through ``lemmatizer.lemmatize``.

    Args:
        domain_concept: Label used only in the warning logged when ``text``
            is a float; it is concatenated with a ``str``.
        text: Text to tokenize. A ``float`` is treated as "no summary"
            (presumably a NaN placeholder for a missing value -- confirm
            against the caller).

    Returns:
        A list holding one list of lemmas per sentence, or an empty list
        when ``text`` is a float.
    """
    if isinstance(text, float):
        logging.warning(domain_concept + ": does not have summary")
        return []

    # Parse every sentence first, then filter, mirroring the two passes.
    docs = [nlp(raw.lower()) for raw in textcleaner.split_sentences(text)]

    sentences = []
    for doc in docs:
        lemmas = []
        for token in doc:
            if token.is_stop or token.is_punct or token.like_num or token.like_url:
                continue
            surface = token.text
            if '\n' in surface or ' ' in surface or len(surface) <= 1:
                continue
            lemmas.append(lemmatizer.lemmatize(surface.strip()))
        sentences.append(lemmas)
    return sentences
def lexrank_summarize(corpus):
    """Summarize every multi-article sample of ``corpus`` with LexRank.

    A sample is a string whose articles are separated by
    ``story_separator_special_tag``. A ``LexRank`` model is built over the
    sentence lists of all samples. For each sample ``get_summary`` is asked
    for as many sentences as the sample has, and they are consumed in the
    returned order until at least 500 space-separated words have been
    collected. Each consumed sentence is attributed to every article that
    contains it; per article the sentences are joined with
    ``" newline_char "`` and the articles are joined with
    ``" story_separator_special_tag "``.

    Returns:
        A list with one summary string per sample.
    """
    list_of_summarization = []
    documents = [
        split_sentences(sample.replace("story_separator_special_tag", "\n"))
        for sample in corpus
    ]
    print("[Document Size: " + str(len(documents)) + "]")

    print("[" + time.strftime("%H:%M:%S", time.localtime()) + "]",
          "Begin building LexRank model...")
    lxr = LexRank(documents, stopwords=STOPWORDS['en'])
    print("[" + time.strftime("%H:%M:%S", time.localtime()) + "]",
          "LexRank model successfully built...")

    for i, sample in enumerate(documents):
        ranked = lxr.get_summary(sample, summary_size=len(sample))
        articles = corpus[i].split("story_separator_special_tag")
        per_article = [[] for _ in articles]

        words_counter = 0
        cursor = 0
        while words_counter < 500 and cursor < len(ranked):
            candidate = ranked[cursor]
            matched = False
            for position, article in enumerate(articles):
                if candidate in article:
                    per_article[position].append(candidate)
                    # Counted once per containing article.
                    words_counter += len(candidate.split(" "))
                    matched = True
            if not matched:
                print("[Error] Summary not in original sample.", candidate, i)
            cursor += 1

        joined_buckets = [" newline_char ".join(bucket) for bucket in per_article]
        list_of_summarization.append(
            " story_separator_special_tag ".join(joined_buckets))

        if i % 100 == 0:
            print("------")
            print(i)
            print("------")

    return list_of_summarization
def get_sentences(n_bytes, text):
    """Wrap every sentence of ``text`` in a ``SimpleSentence``.

    Args:
        n_bytes: Passed through unchanged as the first argument of every
            ``SimpleSentence``.
        text: The text to split with the gensim sentence splitter.

    Returns:
        A list of ``SimpleSentence(n_bytes, index, sentence)`` objects in
        sentence order, with ``index`` starting at 0. Splitting stops at the
        first empty sentence.
    """
    sents = []
    # split into sentences with gensim splitter
    for count, line in enumerate(split_sentences(text)):
        if not line:
            break
        # Pass the sentence itself. Previously the object was built from a
        # stale variable: the first one received the whole document and
        # each later one the text of the preceding sentence.
        sents.append(SimpleSentence(n_bytes, count, line))
    return sents
def build_texts(path):
    '''
    Prepare the corpus documents found under ``path`` for gensim.

    Input: Path containing text files.
    Output: Two parallel lists with one entry per file returned by
            ``get_files``: the document's sentences, and the document's
            tokens from ``gensim.utils.simple_preprocess`` (accents
            removed, minimum token length 3).
    '''
    root = pathlib.Path(path)
    sentence_lists, token_lists = [], []
    for entry in get_files(root):
        content = build_one(entry, root)
        sentence_lists.append(split_sentences(content))
        token_lists.append(
            gensim.utils.simple_preprocess(content, deacc=True, min_len=3))
    return sentence_lists, token_lists
# вариант 2 text_tokenized = list(tokenize(text, lowercase=True)) # переменная, в которую будем складывать длину каждого слова total_length = 0 # цикл, в котором считаем длину каждого отдельного слова для нахождения среднего значения for word in text_normalized: total_length += len(word) # записываем результат writing( 'Статистика по тексту.txt', 'w', '\t\tСтатистические данные по тексту "city-smells".\n\n1. Средняя длина ' 'слова в тексте: ' + str(round(total_length / len(text_normalized))) + ' символов;') # 2. Смотрим среднюю длину предложения в тексте (я взяла функцию из модуля gensim): sentence_list = list(split_sentences(text)) # из предыдущего задания знаем общее количество слов: writing( 'Статистика по тексту.txt', 'a', '\n2. Средняя длина предложения в тексте: ' + str(round(len(text_tokenized) / len(sentence_list))) + ' слов;') # 3. Во сколько раз самое длинное предложение длиннее самого короткого: # по символам: sentence_sizes = [] for sentence in sentence_list: sentence_sizes.append(len(sentence)) writing( 'Статистика по тексту.txt', 'a', '\n3. Cамое длинное предложение длиннее самого (по символам) короткого ' 'предложения в ' + str(max(sentence_sizes) // min(sentence_sizes)) +
def to_sentences(book):
    """Split ``book`` into sentences and tokenize each one.

    Returns:
        A list with the ``simple_preprocess`` result of every sentence, in
        sentence order.
    """
    sentence_tokens = []
    for sentence in textcleaner.split_sentences(book):
        sentence_tokens.append(simple_preprocess(sentence))
    return sentence_tokens
def tokenize_sentences(domain_concept, text):
    """Split ``text`` into sentences and return the kept tokens of each one.

    Every call first flags each entry of a large built-in word list as a
    stop word on ``nlp.vocab``. Each sentence from
    ``textcleaner.split_sentences`` is then lower-cased and run through
    ``nlp``. A token is kept only when it is not a stop word, punctuation,
    number-like or URL-like, its text contains neither a newline nor a
    space, and its text is longer than one character.

    Args:
        domain_concept: Label used only in the warning logged when ``text``
            is a float; it is concatenated with a ``str``.
        text: Text to tokenize. A ``float`` is treated as "no summary"
            (presumably a NaN placeholder for a missing value -- confirm
            against the caller).

    Returns:
        A list holding one list of stripped token strings per sentence, or
        an empty list when ``text`` is a float.
    """
    # The list is reproduced verbatim, duplicates included; setting the flag
    # twice for the same entry is harmless.
    my_stop_words = [
        "a", "about", "above", "after", "again", "against", "ain", "all", "am", "an", "and", "any", "are", "aren", "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can", "couldn", "couldn't", "d", "did", "didn", "didn't", "do", "does", "doesn", "doesn't", "doing", "don", "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn", "hadn't", "has", "hasn", "hasn't", "have", "haven", "haven't", "having", "he", "her", "here", "hers", "herself", "him", "himself", "his", "how", "i", "if", "in", "into", "is", "isn", "isn't", "it", "it's", "its", "itself", "just", "ll", "m", "ma", "me", "mightn", "mightn't", "more", "most", "mustn", "mustn't", "my", "myself", "needn", "needn't", "no", "nor", "not", "now", "o", "of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own", "re", "s", "same", "shan", "shan't", "she", "she's", "should", "should've", "shouldn", "shouldn't", "so", "some", "such", "t", "than", "that", "that'll", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those", "through", "to", "too", "under", "until", "up", "ve", "very", "was", "wasn", "wasn't", "we", "were", "weren", "weren't", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "won", "won't", "wouldn", "wouldn't", "y", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves", "could", "he'd", "he'll", "he's", "here's", "how's", "i'd", "i'll", "i'm", "i've", "let's", "ought", "she'd", "she'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "we'd", "we'll", "we're", "we've", "what's", "when's", "where's", "who's", "why's", "would", "able", "abst", "accordance", "according", "accordingly", "across", "act", "actually", "added", "adj", "affected", "affecting", "affects", "afterwards", "ah", "almost", "alone", "along",
        "already", "also", "although", "always", "among", "amongst", "announce", "another", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "apparently", "approximately", "arent", "arise", "around", "aside", "ask", "asking", "auth", "available", "away", "awfully", "b", "back", "became", "become", "becomes", "becoming", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "believe", "beside", "besides", "beyond", "biol", "brief", "briefly", "c", "ca", "came", "cannot", "can't", "cause", "causes", "certain", "certainly", "co", "com", "come", "comes", "contain", "containing", "contains", "couldnt", "date", "different", "done", "downwards", "due", "e", "ed", "edu", "effect", "eg", "eight", "eighty", "either", "else", "elsewhere", "end", "ending", "enough", "especially", "et", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "except", "f", "far", "ff", "fifth", "first", "five", "fix", "followed", "following", "follows", "former", "formerly", "forth", "found", "four", "furthermore", "g", "gave", "get", "gets", "getting", "give", "given", "gives", "giving", "go", "goes", "gone", "got", "gotten", "h", "happens", "hardly", "hed", "hence", "hereafter", "hereby", "herein", "heres", "hereupon", "hes", "hi", "hid", "hither", "home", "howbeit", "however", "hundred", "id", "ie", "im", "immediate", "immediately", "importance", "important", "inc", "including", "indeed", "index", "information", "instead", "invention", "inward", "itd", "it'll", "j", "k", "keep", "keeps", "kept", "kg", "km", "know", "known", "knows", "l", "largely", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "lets", "like", "liked", "likely", "line", "little", "'ll", "look", "looking", "looks", "ltd", "made", "mainly", "make", "makes", "many", "may", "maybe", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "million", "miss", "ml", "moreover", "mostly", "mr", "mrs", "much",
        "mug", "must", "n", "na", "name", "named", "namely", "nay", "nd", "near", "nearly", "necessarily", "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine", "ninety", "nobody", "non", "none", "nonetheless", "noone", "normally", "nos", "noted", "nothing", "nowhere", "obtain", "obtained", "obviously", "often", "oh", "ok", "okay", "old", "omitted", "one", "ones", "onto", "ord", "others", "otherwise", "outside", "overall", "owing", "p", "page", "pages", "part", "particular", "particularly", "past", "per", "perhaps", "placed", "please", "plus", "poorly", "possible", "possibly", "potentially", "pp", "predominantly", "present", "previously", "primarily", "probably", "promptly", "proud", "provides", "put", "q", "que", "quickly", "quite", "qv", "r", "ran", "rather", "rd", "readily", "really", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "respectively", "resulted", "resulting", "results", "right", "run", "said", "saw", "say", "saying", "says", "sec", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sent", "seven", "several", "shall", "shed", "shes", "show", "showed", "shown", "showns", "shows", "significant", "significantly", "similar", "similarly", "since", "six", "slightly", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specifically", "specified", "specify", "specifying", "still", "stop", "strongly", "sub", "substantially", "successfully", "sufficiently", "suggest", "sup", "sure", "take", "taken", "taking", "tell", "tends", "th", "thank", "thanks", "thanx", "thats", "that've", "thence", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "thereto", "thereupon", "there've", "theyd", "theyre", "think", "thou", "though", "thoughh", "thousand", "throug", "throughout", "thru", "thus", "til", "tip",
        "together", "took", "toward", "towards", "tried", "tries", "truly", "try", "trying", "ts", "twice", "two", "u", "un", "unfortunately", "unless", "unlike", "unlikely", "unto", "upon", "ups", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "v", "value", "various", "'ve", "via", "viz", "vol", "vols", "vs", "w", "want", "wants", "wasnt", "way", "wed", "welcome", "went", "werent", "whatever", "what'll", "whats", "whence", "whenever", "whereafter", "whereas", "whereby", "wherein", "wheres", "whereupon", "wherever", "whether", "whim", "whither", "whod", "whoever", "whole", "who'll", "whomever", "whos", "whose", "widely", "willing", "wish", "within", "without", "wont", "words", "world", "wouldnt", "www", "x", "yes", "yet", "youd", "youre", "z", "zero", "a's", "ain't", "allow", "allows", "apart", "appear", "appreciate", "appropriate", "associated", "best", "better", "c'mon", "c's", "cant", "changes", "clearly", "concerning", "consequently", "consider", "considering", "corresponding", "course", "currently", "definitely", "described", "despite", "entirely", "exactly", "example", "going", "greetings", "hello", "help", "hopefully", "ignored", "inasmuch", "indicate", "indicated", "indicates", "inner", "insofar", "it'd", "keep", "keeps", "novel", "presumably", "reasonably", "second", "secondly", "sensible", "serious", "seriously", "sure", "t's", "third", "thorough", "thoroughly", "three", "well", "wonder", "a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can",
        "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "Des", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well",
        "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "co", "op", "research-articl", "pagecount", "cit", "ibid", "les", "le", "au", "que", "est", "pas", "vol", "el", "los", "pp", "u201d", "well-b", "http", "volumtype", "par", "0o", "0s", "3a", "3b", "3d", "6b", "6o", "a1", "a2", "a3", "a4", "ab", "ac", "ad", "ae", "af", "ag", "aj", "al", "an", "ao", "ap", "ar", "av", "aw", "ax", "ay", "az", "b1", "b2", "b3", "ba", "bc", "bd", "be", "bi", "bj", "bk", "bl", "bn", "bp", "br", "bs", "bt", "bu", "bx", "c1", "c2", "c3", "cc", "cd", "ce", "cf", "cg", "ch", "ci", "cj", "cl", "cm", "cn", "cp", "cq", "cr", "cs", "ct", "cu", "cv", "cx", "cy", "cz", "d2", "da", "dc", "dd", "de", "df", "di", "dj", "dk", "dl", "do", "dp", "dr", "ds", "dt", "du", "dx", "dy", "e2", "e3", "ea", "ec", "ed", "ee", "ef", "ei", "ej", "el", "em", "en", "eo", "ep", "eq", "er", "es", "et", "eu", "ev", "ex", "ey", "f2", "fa", "fc", "ff", "fi", "fj", "fl", "fn", "fo", "fr", "fs", "ft", "fu", "fy", "ga", "ge", "gi", "gj", "gl", "go", "gr", "gs", "gy", "h2", "h3", "hh", "hi", "hj", "ho", "hr", "hs", "hu", "hy", "i", "i2", "i3", "i4", "i6", "i7", "i8", "ia", "ib", "ic", "ie", "ig", "ih", "ii", "ij", "il", "in", "io", "ip", "iq", "ir", "iv", "ix", "iy", "iz", "jj", "jr", "js", "jt", "ju", "ke", "kg", "kj", "km", "ko", "l2", "la", "lb", "lc", "lf", "lj", "ln", "lo", "lr", "ls", "lt", "m2", "ml", "mn", "mo", "ms", "mt", "mu", "n2", "nc", "nd", "ne", "ng", "ni", "nj", "nl",
        "nn", "nr", "ns", "nt", "ny", "oa", "ob", "oc", "od", "of", "og", "oi", "oj", "ol", "om", "on", "oo", "oq", "or", "os", "ot", "ou", "ow", "ox", "oz", "p1", "p2", "p3", "pc", "pd", "pe", "pf", "ph", "pi", "pj", "pk", "pl", "pm", "pn", "po", "pq", "pr", "ps", "pt", "pu", "py", "qj", "qu", "r2", "ra", "rc", "rd", "rf", "rh", "ri", "rj", "rl", "rm", "rn", "ro", "rq", "rr", "rs", "rt", "ru", "rv", "ry", "s2", "sa", "sc", "sd", "se", "sf", "si", "sj", "sl", "sm", "sn", "sp", "sq", "sr", "ss", "st", "sy", "sz", "t1", "t2", "t3", "tb", "tc", "td", "te", "tf", "th", "ti", "tj", "tl", "tm", "tn", "tp", "tq", "tr", "ts", "tt", "tv", "tx", "ue", "ui", "uj", "uk", "um", "un", "uo", "ur", "ut", "va", "wa", "vd", "wi", "vj", "vo", "wo", "vq", "vt", "vu", "x1", "x2", "x3", "xf", "xi", "xj", "xk", "xl", "xn", "xo", "xs", "xt", "xv", "xx", "y2", "yj", "yl", "yr", "ys", "yt", "zi", "zz"]

    # The stop-word flags are set on every call, before the float check.
    for entry in my_stop_words:
        nlp.vocab[entry].is_stop = True

    if isinstance(text, float):
        logging.warning(domain_concept + ": does not have summary")
        return []

    def is_kept(token):
        """Return True when the token passes every filter."""
        if token.is_stop or token.is_punct or token.like_num or token.like_url:
            return False
        surface = token.text
        return '\n' not in surface and ' ' not in surface and len(surface) > 1

    # Parse every sentence first, then filter, mirroring the two passes.
    docs = [nlp(raw.lower()) for raw in textcleaner.split_sentences(text)]
    return [
        [token.text.strip() for token in doc if is_kept(token)]
        for doc in docs
    ]
# Collect the text of every sentence span of ``doc`` into a list.
sents_list = [sent.text for sent in doc.sents]
sents_list

###############################################################################
#5. Tokenization using Keras
from keras.preprocessing.text import text_to_word_sequence

# Word tokenization: the bare ``result`` expression displays the value when
# run interactively.
result = text_to_word_sequence(text)
result

###############################################################################
#6. Tokenization using Gensim
from gensim.utils import tokenize

# Word tokenization: ``tokenize`` is wrapped in ``list`` to materialize it.
list(tokenize(text))

# Sentence tokenization.
from gensim.summarization.textcleaner import split_sentences

result = split_sentences(text)
result

###############################################################################
###############################################################################
doc2vec_epochs = settings[ 'doc2vec_epochs'] if 'doc2vec_epochs' in settings.keys() else 20 if 'doc2vec_epochs' not in settings.keys(): logger.warning( 'doc2vec epochs not in settings; using default value {}.'. format(doc2vec_epochs)) else: logger.info('doc2vec epochs: {}'.format(doc2vec_epochs)) with open(input_file, 'r') as input_fp: text = input_fp.read() text = text.split('\n') text = text[text_start:text_stop] text = ' '.join(text) logger.info('text length: {}'.format(len(text))) sentences = split_sentences(text) if pieces_strategy == pieces_strategies[0]: pieces = [ text[i:i + context_limit_] for i in range(0, len(text), context_limit_) ] + [ text[i + context_limit_ // 2:i + 3 * context_limit_ // 2] for i in range(0, len(text) - context_limit_, context_limit_) ] elif pieces_strategy == pieces_strategies[1]: pieces = [ ' '.join(sentences[index:index + sentences_per_chunk]) for index in range(0, len(sentences), sentences_per_chunk) ] else:
# In[7]:
# Log the token count gathered on ``tokenize`` and the vocabulary size.
logging.info(f' count tokens = {tokenize.all_count_token}')
logging.info(f' vocab size = {len(vocab)}')

# In[8]:
# Append every vocabulary token to the vocab file, one per line; the counts
# are not written.
vocab_txt = 'data/clear_data/vocab.txt'
with open(vocab_txt, 'a') as f:
    for token, token_count in vocab:
        f.write(token + '\n')

# In[9]:
# One list of sentences per text.
sentences_df = data.text.apply(split_sentences)

# In[10]:
# The first 80% of the texts are written to the train file below.
sep = int(len(sentences_df) * 0.8)

# In[11]:
train_sentences_txt = 'data/clear_data/train_sentences.txt'
heldout_sentences_txt = 'data/clear_data/heldout _sentences.txt'
with open(train_sentences_txt, 'a') as f:
    for sentence_list in sentences_df[:sep]:
        for sentence in sentence_list:
            # Only the first character is lower-cased.
            first_char, remainder = sentence[0], sentence[1:]
            f.write(first_char.lower() + remainder + '\n')
def summarize_text(self, text: str):
    """Return the lead sentence of ``text`` as its summary.

    The first sentence is used unless it contains the agency marker
    ``'риа новости'`` ("RIA Novosti"), in which case the second sentence is
    used instead.

    Args:
        text: The text to summarize.

    Returns:
        The chosen sentence. When no sentence is found, ``text`` is returned
        unchanged; when the only sentence carries the marker, that sentence
        is returned. Both cases previously raised ``IndexError``.
    """
    sentences = split_sentences(text)
    if not sentences:
        return text
    if 'риа новости' in sentences[0] and len(sentences) > 1:
        return sentences[1]
    return sentences[0]
# whether to run tests
test_acc = False

# currently available: nyt, washpo
source_name = 'nyt'
source_path = 'source_embeddings/' + source_name
data_source = 'nexis'

if os.path.isfile(source_path) and not force_retrain:
    # A saved model exists and no retrain was requested: reuse it.
    model = gensim.models.Word2Vec.load(source_path)
else:
    # The second CSV column of every row is used as the article text.
    with open('../data/%s.csv' % source_name) as f:
        articles = [row[1] for row in csv.reader(f)]

    # Tokenize every sentence of every article.
    sentences = []
    for article in articles:
        for sen in split_sentences(article):
            sentences.append(list(tokenize_by_word(sen)))

    # Apply the bigram transformer before training the embeddings.
    bigram_transformer = Phrases(sentences)
    sentences = bigram_transformer[sentences]

    model = gensim.models.Word2Vec(
        sentences, size=100, window=10, min_count=2, workers=10)
    model.train(sentences, total_examples=len(sentences), epochs=50)
    model.save(source_path)

if test_acc:
    model.accuracy('questions-words.txt')
mode='w+') # File created by code print("read unprocessed text") df = read_csv(custom_embedding_text_data, encoding='utf-8') print('replace all nan with empty string') df.replace(nan, '', regex=True, inplace=True) print("drop all nan") df = df.dropna().reset_index(drop=True) df = DataFrame(df.text.unique(), columns=['text']) print("remove tags from unprocessed text and write to temporary clean text") for i in range(len(df)): DataFrame(split_sentences(remove_tags(df.iloc[i]['text'])), columns=['temp_clean'], dtype=str).\ to_csv('temporary_clean_text.csv', header=False, index=False, mode='a') logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt='%H:%M:%S', level=logging.INFO) del df gc.collect() print("read temporary clean text to convert to lower case") df_temp_clean = read_csv('temporary_clean_text.csv', usecols=['temp_clean'], dtype={'temp_clean': str}, lineterminator='\n') brief_cleaning = (sub("[^A-Za-z]+", ' ', str(row)).lower()
def splitToSentences(text):
    """Return the sentences of ``text`` as produced by ``textcleaner.split_sentences``."""
    sentence_list = textcleaner.split_sentences(text)
    return sentence_list