def match_questions_tfidf(questions, clusters):
    # clusters = {
    #     "questions": ['and the moon too', 'lets show some'],
    #     "clusterIds": [4, 4]
    # }
    # questions = [
    #     {"id": 11, "question": 'Another one about the sun?'},
    #     {"id": 33, "question": 'What is the distance from the sun though?'},
    #     {"id": 37, "question": 'what\'s the changing factors of the sun and moon together?'}
    # ]

    # Clean and lemmatize all the questions in the clusters
    for idx, question in enumerate(clusters["questions"]):
        question = clean_text(question.replace("\n", ""))
        clusters["questions"][idx] = " ".join([d.lemma_ for d in nlp(question)])

    # Clean and lemmatize all the questions in the orphan group
    for idx, question in enumerate(questions):
        question = clean_text(questions[idx]["question"].replace("\n", ""))
        questions[idx]["question"] = " ".join([d.lemma_ for d in nlp(question)])

    # Create corpus
    completeCorpus = clusters["questions"] + [q["question"] for q in questions]
    # completeCorpus = " ".join(clusters["questions"])
    # completeCorpus += " ".join([q["question"] for q in questions])

    vectorizer = TfidfVectorizer(analyzer='char',
                                 ngram_range=(1, 3),
                                 use_idf=True,
                                 sublinear_tf=True,
                                 smooth_idf=True,
                                 stop_words='english')
    vectorizer.fit(completeCorpus)

    # Add tf-idf vectors to the clusters object for each question
    clusters["question_vectors"] = vectorizer.transform(clusters["questions"]).toarray()
    # for idx, question in enumerate(clusters["questions"]):
    #     clusters["question_vectors"].append(vectorizer.transform(question).toarray())

    for idx, question in enumerate(questions):
        question = questions[idx]["question"]
        questions[idx]["question_vector"] = vectorizer.transform([question]).toarray()[0]

    # print("\n Clusters: ", clusters)
    # print("\n Questions: ", questions)
    # print("\n Complete Corpus: ", completeCorpus)

    cluster.findBestFitCluster(questions, clusters)
    return True
def run_pipeline(self):
    os.chdir(self.loc)

    print("normalize and tokenize file")
    normalized_file_name = self.file_name.split('.')[0] + '.norm.tok.txt'
    normalize_file(self.lang, self.file_name, normalized_file_name)

    print("replacing punctuation with a single space and removing lines with foreign characters")
    clean_file_name = normalized_file_name.replace('.txt', '.clean.txt')
    clean_text(normalized_file_name, clean_file_name, self.dict_path)

    print("removing duplicate lines")
    unique_file_name = clean_file_name.replace('.txt', '.unique.txt')
    remove_duplicate(clean_file_name, unique_file_name)

    print('completed')
def iterateWordClouds():
    cloudfolder = "main_img/WordClouds/"
    sheets = ['Companies', 'Founders', 'VC', 'Events', 'Institutions']
    wordcloud_columns = ['Description', 'About', 'Detailed_Description', 'ShortDescription']

    # Get each text column's word cloud
    for sheet in sheets:
        # sheet = "Founders"
        d = pd.read_excel(io=path + tables, sheet_name=sheet)
        for column in wordcloud_columns:
            # column = "Description"
            print("%s: %s" % (sheet, column))
            if column in d.columns:
                # Filter those without description
                filename = "%s%s %s.png" % (cloudfolder, sheet, column)
                d[column] = d[column].apply(lambda x: " ".join(clean_text(x)) if type(x) is str else None)
                text = d[~d[column].isnull()][column]
                text = " ".join(text.values)
                if sheet == "Companies":
                    text = text.replace("product", "").replace("servic", "").replace("compani", "")
                elif sheet == "Founders":
                    text = text.replace("founder", "").replace("cofound", "")
                GetWordCloud(text, filename, width=890, height=420)
def _clean_texts(input_text, file_format, twitter):
    delimiter = DELIMITER[file_format]
    joiner = JOINER[file_format]
    return joiner.join([
        clean_text(text=text, twitter=twitter)
        for text in input_text.split(delimiter)
    ])
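# A rough usage sketch for _clean_texts above (hypothetical values; DELIMITER and JOINER are
# assumed to be module-level dicts keyed by file format, as the lookups in the function suggest):
#
# DELIMITER = {"txt": "\n"}
# JOINER = {"txt": "\n"}
# _clean_texts("first line\nsecond line", file_format="txt", twitter=False)
# # -> each segment is passed through clean_text and the results are re-joined with the joiner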
def predict(self, x):
    predicts = []
    for test_str in x:
        test_str = "<s> " + test_str + " </s>"
        a = clean_text.clean_text(test_str)
        #print(a)
        a = a.split(' ')
        x_seq = numpy.zeros((len(a), self.n_in), dtype='float64')
        y_seq = numpy.zeros((len(a),), dtype='int32')
        for i in range(len(a)):
            word = a[i]
            if word in self.word2label:
                y_seq[i] = self.word2label[word]
            else:
                y_seq[i] = self.word2label["%%%"]
            if word in self.model:
                x_seq[i, :] = self.model[word]
            else:
                x_seq[i, :] = self.model["xxxxx"]
        #print(y_seq)
        predicts.append(self.f_pred(*[x_seq, y_seq]))
    return predicts
def get(self, model, text):
    if model.lower() == "use_large":
        text = clean_text(text)
        vecs = np.array(use_large([text]))
        return {"use_large": vecs[0].tolist()}
    else:
        return {"error": "unknown model"}
def calc_flexibility_and_elaboration_multi_target(responses, target_words, nlp):
    """Calculate flexibility (spacy similarity corrected for chance similarity) and elaboration
    (number of words) when each response has its own target word."""
    data = pd.DataFrame({'response': responses, 'target_word': target_words})
    data['clean_response'] = [
        clean_text(response, nlp) for response in data.response
    ]
    data['elaboration'] = data.apply(
        lambda row: len(row.clean_response.split())
        if row.clean_response is not None else None,
        axis=1)

    # to control for effects of response length (elaboration) on semantic similarity, calculate similarity expected by
    # chance for all given response lengths to subtract from response similarity
    # (Forthmann et al, 2018 https://doi.org/10.1002/jocb.240)
    bootstrapped_sims = {}
    for target in data.target_word.unique():
        word_counts = data.elaboration.loc[data.target_word == target].unique()
        bootstrapped_sims[target] = bootstrap_similarity(word_counts, target)

    data['raw_similarity'] = data.apply(
        lambda row: calc_similarity(row.clean_response, row.target_word, nlp),
        axis=1)
    data['corrected_similarity'] = data.apply(
        lambda row: row.raw_similarity - bootstrapped_sims[row.target_word][row.elaboration]
        if not np.isnan(row.elaboration) else row.raw_similarity,
        axis=1)

    # flexibility is a dissimilarity score, so invert the similarity score to get flexibility
    data['flexibility'] = (1 - abs(data['corrected_similarity']))

    return data[['clean_response', 'elaboration', 'flexibility']]
def match_questions_with_categories(questions, clusters):
    """A simple matching algorithm that places a question into a pre-created cluster if:
    1. The question's lemmatized form contains the cluster's keyword
    2. The question contains no rarer English words that are also cluster keywords

    Parameters:
    questions (list[dict]): A list of dictionaries with an id and question (text) field
    clusters (list[string]): A list of pre-created keywords
    """
    cluster_additions = {"uncategorized": []}
    for question in questions:
        clean_question = clean_text(question["question"].replace("\n", ""))
        cluster_options = set()
        for token in nlp(clean_question):
            if token.lemma_ in clusters:
                cluster_options.add(token.lemma_)
        if len(cluster_options) == 0:
            cluster_additions["uncategorized"].append(question["id"])
            continue
        best_keyword = None
        rarest_freq = 1
        for keyword in cluster_options:
            if word_frequency(keyword, "en") < rarest_freq:
                rarest_freq = word_frequency(keyword, "en")
                best_keyword = keyword
        if best_keyword in cluster_additions:
            cluster_additions[best_keyword].append(question["id"])
        else:
            cluster_additions[best_keyword] = [question["id"]]
    return cluster_additions
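# A rough usage sketch for match_questions_with_categories (hypothetical data; assumes the
# module-level nlp, clean_text, and word_frequency used above are already loaded):
#
# questions = [
#     {"id": 11, "question": "How far away is the sun?"},
#     {"id": 12, "question": "Why does the moon change shape?"},
# ]
# clusters = ["sun", "moon"]
# match_questions_with_categories(questions, clusters)
# # -> {"uncategorized": [], "sun": [11], "moon": [12]}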
def calc_flexibility_and_elaboration(responses, target_word, nlp):
    """Calculate flexibility (spacy similarity corrected for chance similarity) and elaboration
    (number of words).

    Arguments
    ---------
    responses: list
    target_word: string
    nlp: Spacy model

    Returns
    -------
    pandas dataframe with 3 columns: clean_response, elaboration, flexibility
    """
    data = pd.DataFrame({'clean_response': clean_text(response, nlp)}
                        for response in responses)
    data['elaboration'] = data.apply(
        lambda row: int(len(row.clean_response.split()))
        if row.clean_response is not None else None,
        axis=1)

    # to control for effects of response length (elaboration) on semantic similarity, calculate similarity expected by
    # chance for all given response lengths to subtract from response similarity
    # (Forthmann et al, 2018 https://doi.org/10.1002/jocb.240)
    word_counts = data.elaboration.unique()
    bootstrapped_sims = bootstrap_similarity(word_counts, target_word)

    data['raw_similarity'] = data.apply(
        lambda row: calc_similarity(row.clean_response, target_word, nlp),
        axis=1)
    data['corrected_similarity'] = data.apply(
        lambda row: row.raw_similarity - bootstrapped_sims[row.elaboration]
        if not np.isnan(row.elaboration) else row.raw_similarity,
        axis=1)

    # flexibility is a dissimilarity score, so invert the similarity score to get flexibility
    data['flexibility'] = (1 - abs(data['corrected_similarity']))

    return data[['clean_response', 'elaboration', 'flexibility']]
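# A rough usage sketch for calc_flexibility_and_elaboration (hypothetical data; assumes a loaded
# spaCy model plus the clean_text, bootstrap_similarity, and calc_similarity helpers referenced above):
#
# import spacy
# nlp = spacy.load("en_core_web_lg")
# responses = ["use the brick as a paperweight", "grind it into red pigment"]
# scores = calc_flexibility_and_elaboration(responses, "brick", nlp)
# # -> DataFrame with clean_response, elaboration (word count), and flexibility columns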
def clean_data(remove_punctuation_marks, remove_stopwords):
    start = time.time()
    data_path = get_path(remove_punctuation_marks, remove_stopwords)
    for year in range(2018, 2019):
        data = read_data(year)
        entries = 0
        buffers = []
        mw_buffer = []
        f = open(data_path + str(year) + '.csv', 'a', errors='ignore')
        prev = start
        print('cleaning ', year)
        for i in range(0, len(data)):
            orig_text = ' '.join(data[i][2:]).replace('"', '').replace("'", '')
            if (orig_text.strip() != ''):
                clean_entry, _ = clean_text(
                    orig_text,
                    clean_only=True,
                    remove_stopwords=remove_stopwords,
                    remove_punctuation_marks=remove_punctuation_marks)
                print(i, "/", entries, " took: ", time.time() - prev,
                      " len tokens: ", len(clean_entry))
                prev = time.time()
                if (len(clean_entry) > 1):
                    entries += 1
                    buffers.append(clean_entry)
                    if ((len(buffers) + 1) % 3 == 0):
                        np.savetxt(f, buffers, fmt='%s', delimiter=",")
                        buffers = []
            else:
                print(orig_text)
        f.close()
    end = time.time()
    print('took overall: ', end - start)
    return entries
def predict(self, x):
    predicts = []
    for test_str in x:
        test_str = "<s> " + test_str + " </s>"
        a = clean_text.clean_text(test_str)
        print(a)
        a = a.split(' ')
        x_seq = numpy.zeros((len(a), self.n_in), dtype='float64')
        y_seq = numpy.zeros((len(a),), dtype='int32')
        for i in range(len(a)):
            word = a[i]
            if word in self.word2label:
                y_seq[i] = self.word2label[word]
            else:
                y_seq[i] = self.word2label["%%%"]
            if word in self.model:
                x_seq[i, :] = self.model[word]
            else:
                x_seq[i, :] = self.model["xxxxx"]
        predicts.append(self.f_pred(*[x_seq, y_seq]))
    return predicts
def get_cleaned_subject(tarpath, header):
    email = eh.read_email(tarpath)
    subject = eh.extract_header(email, header)
    if len(subject) != 0:
        cleaned = ct.clean_text(subject[0])
        return cleaned
    if len(subject) == 0:
        return []
def get_current_normalized_skill(skill, punc_marks, stopwords, multiword):
    norm = " ".join(
        clean_text(skill,
                   remove_stopwords=not stopwords,
                   remove_punctuation_marks=not punc_marks)[0])
    if (multiword and len(norm.split()) > 1):
        norm, _ = get_multiword_tokens(norm.split())
        return norm
    else:
        return norm.split()
def tokenize_space(sentence):
    '''Return a cleaned and whitespace-tokenized sentence.

    Example:
        >>> s = '1987 본 문 대통령.."그런다고 바뀌나? 함께 하면 바뀐다"'
        >>> tokenize_space(s)
        ['1987', '본', '문', '대통령', '그런다고', '바뀌나', '함께', '하면', '바뀐다']
    '''
    sentence = clean_text(sentence)
    sent_tokened = [i for i in sentence.split(' ') if len(i) > 0]
    return sent_tokened
def clean_the_data(dataframe):
    """
    INPUT: a dataframe with unparsed text in the description field
    OUTPUT: a dataframe with cleaned text in the description field
    """
    for i in dataframe.index:
        doc = ct.clean_text(dataframe.loc[i, 'description'])
        dataframe.loc[i, 'description'] = doc
    return dataframe
def clean_training_data(multiword, punc_marks, stopwords, window):
    entries = read_training_data()
    for i in range(0, len(entries)):
        print(i)
        tokens, pos_tags = clean_text(entries[i][2],
                                      remove_stopwords=not stopwords,
                                      remove_punctuation_marks=not punc_marks)
        if (multiword):
            tokens, pos_tags = get_multiword_tokens(tokens, pos_tags)
        write_clean_tokens_to_file(tokens, pos_tags, multiword, punc_marks,
                                   stopwords, window)
    print('done')
def process_docs(self):
    """Process docs that have an id attribute and a question attribute."""
    self.documents = []
    if self._raw_docs and self._nlp:
        for doc in self._raw_docs:
            doc["question"] = clean_text(doc["question"].replace("\n", ""))
            if len(doc["question"]) == 0:
                doc["question"] = " "
            processed_doc = self._nlp(doc["question"])
            processed_doc._.tag = doc["id"]
            processed_doc._.lemmatized = ' '.join(
                [d.lemma_ for d in processed_doc])
            self.documents.append(processed_doc)
def read_classification_data(data_path,
                             remove_punctuation_marks=True,
                             remove_stopwords=True,
                             multiword=False):
    p = data_path.rpartition('.')
    clean_data_path = p[0] + '_cleaned.' + p[2]
    if (not path.exists(p[0] + '_cleaned.' + p[2])):
        print('no cleaned data found for: ' + data_path)
        f = open(data_path, 'r')
        data = list(csv.reader(f, quoting=csv.QUOTE_NONE, delimiter=';'))
        f.close()
        print('Found ', len(data), "entries.")
        f = open(clean_data_path, 'a', errors='ignore')
        prev = time.time()
        entries = 0
        buffers = []
        for i in range(0, len(data)):
            orig_text = data[i][1].replace('"', '').replace("'", '')
            clean_entry, _ = clean_text(
                orig_text,
                clean_only=True,
                remove_stopwords=remove_stopwords,
                remove_punctuation_marks=remove_punctuation_marks)
            print(i, "/", entries, " took: ", time.time() - prev,
                  " len tokens: ", len(clean_entry))
            prev = time.time()
            if (len(clean_entry) > 1):
                entries += 1
                buffers.append(clean_entry)
                if ((len(buffers) + 1) % 3 == 0):
                    np.savetxt(f, buffers, fmt='%s', delimiter=",")
                    buffers = []
        f.close()
    else:
        print('clean data found at: ' + clean_data_path)
        entries = []
        for year in range(2017, 2018):
            if (not multiword):
                fh = open(clean_data_path, 'r')
                lines = (l.replace('\0', '') for l in fh)
                entries = entries + list(csv.reader(lines, quotechar="'"))
                for i, data in enumerate(entries):
                    entries[i] = [
                        d.replace("'", '').replace("[", "").replace("]", "").strip()
                        for d in data
                    ]
                fh.close()
    return entries
def prepare_text(type):
    print("[*] Concatenation started\n")
    if type.lower() == 'train':
        inpath = "train"
        dirs = (file for file in listdir(inpath) if isdir(join(inpath, file)))
        for directory in dirs:
            print(directory)
            concatenate_files(
                join(inpath, directory),
                join(inpath, directory, 'Result',
                     'input_' + directory.lower() + '_tmp.txt'))
            clean_text(join(inpath, directory, 'Result',
                            'input_' + directory.lower() + '_tmp.txt'),
                       join(inpath, directory, 'Result',
                            'input_' + directory.lower() + '.txt'),
                       zip_files=False)
            remove(
                join(inpath, directory, 'Result',
                     'input_' + directory.lower() + '_tmp.txt'))
    elif type.lower() == 'test':
        inpath = "test"
        filename = "test_author"
        concatenate_files(inpath, join(inpath, 'Result', filename + '_tmp.txt'))
        clean_text(join(inpath, 'Result', filename + '_tmp.txt'),
                   join(inpath, 'Result', filename + '.txt'),
                   zip_files=False)
        remove(join(inpath, 'Result', filename + '_tmp.txt'))
    print("\n[*] Concatenation ended")
def export_ds(subsets, phase):
    raw_text = {
        "world": '',
        "sports": '',
        "business": '',
        "sci": ''
    }
    for data_label, subset in zip(subsets, raw_text):
        for text in subsets[data_label]:
            text = clean_text(text)
            raw_text[subset] += f'\n\n{text}'
    for subset in raw_text:
        dump_data(phase, subset, raw_text[subset])
def process_title(title, article):
    title_set = clean_text(title)
    id = frozenset_to_filename(frozenset(title_set))
    if len(id) >= 254 or len(pathname2url(id)) >= 254:
        return "error"
    else:
        try:
            title_entry = titles_dict.get(id, Title(id=id, words=title_set))
            if article.id not in set(title_entry.articles):
                title_entry.add_article(article.id)
                titles_dict[id] = title_entry
                add_title_to_words(title_entry)
            return title_entry
        except:
            return "error"
def get_search_result(query):
    """ Main Program """
    cleaned_query = clean_text(query)
    tag_name = get_tag(cleaned_query)

    # If tag name is None, the whole Title column is given as input to
    # search_result to find similarity with the entire corpus
    if tag_name:
        df = subset_df(tag_name)
        output_var = search_result(df, cleaned_query)
    else:
        output_var = search_result(medium_cleaned, cleaned_query)
    return output_var
def export_ds(subset, groups):
    for entry in groups:
        dataset, _ = fetch_20newsgroups(data_home='./20ng_od',
                                        subset=subset,
                                        categories=entry['names'],
                                        remove=('headers', 'footers', 'quotes'),
                                        return_X_y=True)
        corpus = ''
        for article in dataset:
            stripped = re.sub(r'\s+', ' ', article)
            stripped = clean_text(stripped)
            corpus += f'\n\n{stripped}'
        full_path = os.path.join('./20ng_od', subset)
        if not os.path.exists(full_path):
            os.makedirs(full_path)
        with open(os.path.join(full_path, f'{entry["topic"]}.txt'), 'w') as f:
            f.write(corpus)
def LDA(text):
    from gensim.models import ldamodel
    from clean_text import clean_text
    from LDA_AuxFunc import sent_to_words, remove_stopwords, make_bigrams, lemmatize
    import gensim.corpora as corpora
    import warnings
    import logging

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.ERROR)
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    text = clean_text(text)
    data_words = list(sent_to_words(text))
    data_words_nostops = remove_stopwords(data_words)
    data_words_bigrams = make_bigrams(data_words, data_words_nostops)
    data_lemmatized = lemmatize(data_words_bigrams,
                                allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    id2word = corpora.Dictionary(data_lemmatized)
    corpus = [id2word.doc2bow(text) for text in data_lemmatized]

    lda_model = ldamodel.LdaModel(corpus=corpus,
                                  id2word=id2word,
                                  num_topics=3,
                                  random_state=1,
                                  update_every=1,
                                  chunksize=50,
                                  passes=20,
                                  alpha='auto',
                                  per_word_topics=True)
    return (lda_model, corpus, id2word)
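# A rough usage sketch for LDA above (hypothetical input; assumes the clean_text and LDA_AuxFunc
# helpers imported inside the function are importable):
#
# lda_model, corpus, id2word = LDA(raw_text)
# lda_model.print_topics(num_words=10)   # inspect the three fitted topics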
print("Read label2word ...")
with open("label2word.txt", 'r', encoding='utf-8') as file:
    for line in file:
        lines = line.strip('\n').split('\t')
        label = lines[0]
        word = lines[1]
        word2label[word] = label
        label2word[label] = word

print("Read training data ...")
#with open('train_small.txt', 'r', encoding='UTF-8') as file:
with open('training.txt', 'r', encoding='UTF-8') as file:
    for line in file:
        a = clean_text.clean_text(line)
        a = a.split(' ')
        if len(a) < 5:
            continue
        x_seq_list.append(a)
        '''
        x_seq = np.zeros((len(a), word_vec_len), dtype='float64')
        y_seq = np.zeros((len(a),), dtype='int32')
        for i in range(len(a)):
            word = a[i]
            if word in word2label:
                y_seq[i] = word2label[word]
# Convert "<s>" and "</s>" to "."
word2label["."] = labelCount
label2word[labelCount] = "."
labelCount += 1

# Other words are set to "%%%"
word2label["%%%"] = labelCount
label2word[labelCount] = "%%%"
labelCount += 1

# Map word to labelID
with open('MLDS_Final/sentence/train_clean.set', 'r', encoding='UTF-8') as file:
    for line in file:
        a = clean_text.clean_text(line)
        a = a.split(' ')
        for i in range(len(a)):
            word = a[i]
            if word not in word2label:
                word2label[word] = labelCount
                label2word[labelCount] = word
                labelCount += 1

n_hidden = 100
n_in = word_vec_len
n_out = len(label2word)

RNN = MetaRNN(n_in=n_in, n_hidden=n_hidden, n_out=n_out,
              activation='tanh', output_type='softmax',
              use_symbolic_softmax=True)
import clean_text as ct
import parse_and_store as ps
import word_vectors as wv
import calculate_score as cs
import pickle
import gensim

description_model = gensim.models.Word2Vec.load('word_model')

"""
INPUT: the raw text to be analyzed
OUTPUT: a string saying whether the text is fraudulent or not
"""

if __name__ == "__main__":
    document = raw_input("enter the document to be analyzed: ")
    clean_doc = ct.clean_text(document)
    doc_df = ps.store_and_parse(clean_doc)
    features = wv.write_word_vectors(doc_df, description_model)
    model = pickle.load(open('MODEL', 'rb'))
    y_pred = model.predict(features)
    scoring_df = cs.scoring_df(doc_df, y_pred)
    cs.calculate_frauds(scoring_df)
def get_cleaned_readings(self, n):
    df_raw = download_readings(n)
    df = df_raw.copy()
    for each in df_raw.columns:
        df[each] = df[each].apply(lambda x: clean_text(x))
    return df
def get_cleaned_body(tarpath):
    email = eh.read_email(tarpath)
    body = eh.extract_body(email)
    cleaned = ct.clean_text(body)
    return cleaned
location, location_type = dataset_helpers.get_tweet_location(hit)
username, verified = dataset_helpers.get_tweet_user(hit)
tweets.append((text, username, verified, location, location_type))
if args.numresults != -1 and i + 1 == args.numresults:
    break

num_tweets = len(tweets)
print("{0} tweets found.".format(num_tweets))
print()

# clean the text
print("Cleaning text...")
print()
if args.modeltype == "Word2Vec":
    tweets = [(clean_text(t, normalize_case=True, blacklist_regex="non_alpha_numeric"),
               u, v, clean_text(l), p) for t, u, v, l, p in tweets]
elif args.modeltype == "TFHub":
    tweets = [(clean_text(t), u, v, clean_text(l), p) for t, u, v, l, p in tweets]
else:
    raise ValueError("Unknown model type.")

# filter out empty tweets and replace empty locations
tweets = [t for t in tweets if t[0] != ""]
tweets = [(t, u, v, l if l != "" else "[No location available]", p)
          for t, u, v, l, p in tweets]

empty_tweets = num_tweets - len(tweets)
if empty_tweets > 0:
    print("Removed {0} empty tweet(s).".format(empty_tweets))
repos_dir = sys.argv[1]
dest = sys.argv[2]

for repo_dir in os.listdir(repos_dir):
    full_path = os.path.join(repos_dir, repo_dir)
    if not os.path.isdir(full_path):
        continue
    dest_path = os.path.join(dest, repo_dir)
    with open(dest_path, 'w') as out_handle:
        total_read = 0
        for root, dirs, files in os.walk(full_path):
            for filename in files:
                f_full_path = os.path.join(root, filename)
                try:
                    with open(f_full_path) as handle:
                        everything = handle.read(100000)  # read at most 100,000 characters per file
                        everything = ' '.join(clean_text(everything))
                        total_read += len(everything.encode('utf-8'))
                        out_handle.write(everything)
                        out_handle.write(' ')
                        if total_read > 1000000:
                            break
                except:
                    pass
            if total_read > 1000000:
                break
print("Read label2word ...")
with open("label2word.txt", 'r', encoding='utf-8') as file:
    for line in file:
        lines = line.strip('\n').split('\t')
        label = lines[0]
        word = lines[1]
        word2label[word] = label
        label2word[label] = word

print("Read training data ...")
with open('train_small.txt', 'r', encoding='UTF-8') as file:
#with open('training.txt', 'r', encoding='UTF-8') as file:
    for line in file:
        a = clean_text.clean_text(line)
        a = a.split(' ')
        if len(a) < 5:
            continue
        x_seq = np.zeros((len(a), word_vec_len), dtype='float64')
        y_seq = np.zeros((len(a),), dtype='int32')
        for i in range(len(a)):
            word = a[i]
            if word in word2label:
                y_seq[i] = word2label[word]
            else:
                y_seq[i] = word2label["%%%"]
# Convert "<s>" and "</s>" to "."
word2label["."] = labelCount
label2word[labelCount] = "."
labelCount += 1

# Other words are set to "%%%"
word2label["%%%"] = labelCount
label2word[labelCount] = "%%%"
labelCount += 1

# Map word to labelID
with open('MLDS_Final/sentence/train_clean.set', 'r', encoding='UTF-8') as file:
    for line in file:
        a = clean_text.clean_text(line)
        a = a.split(' ')
        for i in range(len(a)):
            word = a[i]
            if word not in word2label:
                word2label[word] = labelCount
                label2word[labelCount] = word
                labelCount += 1

n_hidden = 100
n_in = word_vec_len
n_out = len(label2word)

RNN = MetaRNN(n_in=n_in, n_hidden=n_hidden, n_out=n_out,