def basic_features():
    # Spelling corrector
    df_all['search_term'] = df_all['search_term'].apply(lambda x: spellingCorrector.correctSpelling(x))
    # Remove stop words
    df_all['product_description'] = df_all['product_description'].apply(lambda x: utils.remove_stop_words(x))
    df_all['product_title'] = df_all['product_title'].apply(lambda x: utils.remove_stop_words(x))
    df_all['search_term'] = df_all['search_term'].apply(lambda x: utils.remove_stop_words(x))
    # Stem words
    df_all['product_description'] = df_all['product_description'].apply(lambda x: utils.str_stem(x))
def similarity_score(t, s):
    """Returns a similarity score for a given sentence.

    similarity score = the number of tokens in the sentence that exist
    within the title / total words in the title (scaled by 0.1)
    """
    t = utils.remove_stop_words(t.lower())
    s = utils.remove_stop_words(s.lower())
    t_tokens, s_tokens = t.split(), s.split()
    similar = [w for w in s_tokens if w in t_tokens]
    if len(t_tokens) == 0:
        return 0
    score = (len(similar) * 0.1) / float(len(t_tokens))
    return score
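A quick illustrative call (hypothetical inputs; assumes utils.remove_stop_words returns a space-separated string and keeps all of these words):

# Hypothetical example: 2 of the 4 title tokens ("galvanized", "steel") also
# appear in the sentence, so score = (2 * 0.1) / 4 = 0.05.
title = "angle bracket galvanized steel"
sentence = "galvanized steel corner brace"
print(similarity_score(title, sentence))  # 0.05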
def compute_word_to_weight_helper(self, word_to_weight, line, section_index):
    line = re.sub('[^0-9a-zA-Z]+', ' ', line)
    line = utils.remove_stop_words(line)
    if self.section_weights[section_index] == 0:
        return
    word_weight = self.section_weights[section_index]
    tokens = nltk.word_tokenize(line)
    words = [0] * len(tokens)
    stemmer = nltk.stem.lancaster.LancasterStemmer()
    for i in range(len(tokens)):
        words[i] = stemmer.stem(tokens[i])
    # Also add 2- and 3-sized word grams into the features
    for index, word in enumerate(words):
        for new_index in range(index, index + self.max_word_gram_size):
            # Ensure words are not out of range
            if len(words) <= new_index:
                continue
            word_gram = words[index:new_index + 1]
            key = ' '.join(word_gram)
            if self.sectionalize:
                key = str(section_index) + '_' + key
            if key in word_to_weight:
                word_to_weight[key] += word_weight
            else:
                word_to_weight[key] = word_weight
def get_cleaned_docs_from_db():
    documents = session.query(Document).all()
    raw_filenames = [doc.to_dict()['raw_filename'] for doc in documents]
    cleaned_docs = []
    for raw_filename in raw_filenames:
        k = Key(b)
        k.key = 'documents/{}'.format(raw_filename)
        try:
            raw = k.get_contents_as_string()
            soup = BeautifulSoup(raw, 'html.parser')
            title, doc = get_content(soup)
            print(title)
            doc = clean_document(doc)
            cleaned_doc = utils.remove_stop_words(doc)
            cleaned_docs.append(cleaned_doc)
        except Exception as e:
            print(e)
    vsm = VSMapping()
    vsm.batch_fit(cleaned_docs)
    transformations = [vsm.transform(doc) for doc in cleaned_docs]
    for i, (vid, vector) in enumerate(transformations):
        # Update the Document's vector_id
        document_id = documents[i].to_dict()['id']
        print(document_id, vid)
        session.query(Document).filter(Document.id == document_id).update({'vector_id': vid})
        session.commit()
    print(len(cleaned_docs))
def upvote():
    try:
        df = utils.read_data(FILE)
        if len(request.form["search_type"]) > 0 and isinstance(request.form["search_type"], str) \
                and str(request.form["search_type"]) in ["label_choice", "name_choice", "research_choice"]:
            search_type = request.form["search_type"]
            if search_type == "label_choice":
                send_msg = request.form["wine_name"]
                answer = str(send_msg)
            elif search_type == "name_choice":
                answer = df.loc[df['name'] == str(request.form["wine_name"])]
                answer = answer[['name', 'image', 'pays', 'region', 'appelation', 'domaine',
                                 'millesime', 'couleur', 'description']].head(10)
            elif search_type == "research_choice":
                couleur = str(request.form["couleur"])
                pays = str(request.form["pays"])
                want = str(request.form["want"])
                want = utils.remove_stop_words(want, STOP)
                want = str(utils.process_request(want))
                answer = utils.generate_research_choice(df, couleur, pays)
                if len(want) > 0:
                    answer = utils.treat_input(answer, want, 'clean_text')
            return utils.generate_answer(answer)
        else:
            return False
    except Exception:
        return False
def test(X):
    jaccard_scores = []
    for article in X:
        scores = []
        for sentence1, sentence2 in zip(article, article[1:]):
            stopped_sentence1 = utils.remove_stop_words(sentence1)
            stemmed_sentence1 = utils.stem_tokens(stopped_sentence1)
            stopped_sentence2 = utils.remove_stop_words(sentence2)
            stemmed_sentence2 = utils.stem_tokens(stopped_sentence2)
            scores.append(utils.jaccard(stemmed_sentence1, stemmed_sentence2))
        if scores:
            jaccard_scores.append(np.average(scores))
        else:
            jaccard_scores.append(0.1)
    return np.array(jaccard_scores).reshape(len(jaccard_scores), 1)
def show_frecuencia():
    words = request.args.getlist('word')
    date = request.args.get('date')
    accounts = request.args.getlist('account')
    polaridad = request.args.getlist('polaridad')
    tweets = get_filtros(words, date, accounts, polaridad)
    users = get_tweets_with_its_user(tweets)
    n = [{"tweets": t, "user": u} for t, u in zip(tweets, users)]
    words3 = []
    counts = []
    for tweet in n:
        words2 = tweet["tweets"]["text"].split(" ")
        processedWords = utils.remove_stop_words(utils.emoji(utils.signos(utils.https(utils.numbers(words2)))))
        for word in processedWords:
            if word != '':
                if word.lower() not in words3:
                    words3.append(word.lower())
                    counts.append(1)
                else:
                    index = words3.index(word.lower())
                    counts[index] += 1
    ans = [{"_id": word, "count": count} for word, count in zip(words3, counts)]
    fd = nltk.FreqDist(counts)
    freqq = []
    ans2 = []
    for item in ans:
        num = item['count']
        t = num / np.sum(counts)
        freqq.append(t)
    prom = np.mean(freqq)
    de = np.std(freqq)
    for item in ans:
        if freqq[ans.index(item)] > prom + de and item['_id'] not in words:
            ans2.append(item)
    return {"tweets": n, "data": ans2}
def __init__(self):
    self.index = CustomInvertedIndex()
    http = urllib3.PoolManager()
    page = 'http://www.imdb.com/search/title?groups=top_1000&sort=user_rating&view=advanced&page='
    rows = []
    for i in range(1, 21):
        url = page + str(i)
        raw_html = http.request('GET', url).data.decode('utf-8')
        soup = BeautifulSoup(raw_html, 'lxml')
        movie_tags = soup.find_all('div', class_='lister-item-content')
        for movie_tag in movie_tags:
            movie = movie_tag.a.string
            title_terms = remove_stop_words(movie.lower().split(' '))
            directors_actors_tag = movie_tag.contents[9]
            directors = []
            actors = []
            currently_filling = directors
            for tag in directors_actors_tag.contents:
                if tag.name == 'span':
                    currently_filling = actors
                    continue
                if not isinstance(tag, bs4.element.NavigableString):
                    person = tag.string.lower()
                    currently_filling.append(person)
            genre_tag = movie_tag.contents[3].find(class_='genre')
            genres = genre_tag.string.lstrip('\n').rstrip().lower().split(', ')
            # Time to update the index with the parsed values
            self.index.update(title_terms, movie, Priority.Title.value)
            for actor in actors:
                self.index.update(actor.split(' '), movie, Priority.Actor.value)
            for director in directors:
                self.index.update(director.split(' '), movie, Priority.Director.value)
            self.index.update(genres, movie, Priority.Genre.value)
def question_c():
    logging.info("<Question C> Getting the significance and TFxICF representation")
    all_categories = train_full_set.target_names
    all_docs_per_category = []
    classes_list = [
        train_full_set.target_names.index("comp.sys.ibm.pc.hardware"),
        train_full_set.target_names.index("comp.sys.mac.hardware"),
        train_full_set.target_names.index("misc.forsale"),
        train_full_set.target_names.index("soc.religion.christian")
    ]
    logging.info("Store data from all docs of a certain category as entries in all_data_category")
    for cat in all_categories:
        train_category = utils.fetch_data([cat])[0]
        data_category = train_category.data
        temp = ''
        for doc in data_category:
            temp += ' ' + doc
        all_docs_per_category.append(temp)
    logging.info("Now build frequency tables for each class")
    vectorized_newsgroups_train = utils.remove_stop_words(all_docs_per_category)
    print(vectorized_newsgroups_train.shape)
    max_term_freq_per_category = [0] * vectorized_newsgroups_train.shape[0]
    category_count_per_term = [0] * vectorized_newsgroups_train.shape[1]
    for i in range(vectorized_newsgroups_train.shape[0]):
        max_term_freq_per_category[i] = max(vectorized_newsgroups_train[i].data)
    category_count_per_term = vectorized_newsgroups_train.sum(axis=0)
    print(max_term_freq_per_category)
    print(category_count_per_term)
def collect(url, vsm):
    print(url)
    print('Downloading doc...')
    title, doc, raw_filename = download_document(url)
    print('Cleaning doc...')
    doc = clean_document(doc)
    cleaned_doc = utils.remove_stop_words(doc)
    print('Fitting VSM...')
    vsm.partial_fit(cleaned_doc)
    doc_vid, doc_vector = vsm.transform(doc)
    print('Summarize...')
    summary = summarize(title, doc, cleaned_doc, doc_vector, vsm.feature_names)
    save_document(title, summary, url, raw_filename, doc_vid)
    print('done')
def __street_sweep(self, streetFlags, stopWords):
    """Takes a list of common street flags (road, street, ...) and a list of
    phrasing words to remove from the address (into, the, on, ...) and searches
    for possible addresses within the comment field. The longest address with a
    street number is then selected and the street and number fields are updated.
    """
    i = 0
    length = 0
    num = False
    punc_comment = remove_punc(self.comment)
    while i < len(streetFlags):
        f_num = False
        punc_flag = remove_punc(streetFlags[i])
        pat = re.compile(r"(\d{1,3}[a-zA-Z]?)?\s?([a-zA-Z]*\s?[a-zA-Z]+\s)" +
                         r"(" + punc_flag + r")(,|\s|.|$)", re.IGNORECASE)
        match = re.search(pat, punc_comment)
        if match:
            if match.group(2):
                if match.group(1):
                    f_num = True
                templine = remove_stop_words(match.group(2), stopWords)
                if templine:
                    templine = templine.strip(' ')
                    f_len = len(templine.split(' '))
                else:
                    f_len = 0
                # Larger addresses are favoured and street numbers are
                # favoured above all else
                if (f_len > length and num == f_num) or (f_num and not num) \
                        and f_len > 0:
                    self.road = templine + ' ' + match.group(3)
                    num = f_num
                    length = f_len
                    if num:
                        self.number = match.group(1)
        i += 1
def execute_query(self, query):
    start = time.time()
    query_tokens = process_and_tokenize_string(query)
    unprocessed_query_tokens = split_strings(query)
    self.logger.info(" Executing Query: '" + str(query) + "' ---- tokens:" + str(query_tokens))
    top_docs = [TopPassage(doc) for doc in self.docs]
    question_class = -1
    for i, wh in enumerate(wh_questions):
        if wh in unprocessed_query_tokens:
            question_class = i
            break
    # pos_list = nltk.pos_tag(unprocessed_query_tokens)
    tokens_synonyms = []
    for token in remove_stop_words(unprocessed_query_tokens):
        tokens_synonyms += get_processed_synonyms(token)
    # print(tokens_synonyms)
    ngrams_vector = self.ngrams.query(query_tokens, self.docs)
    expanded_ngram_vector = self.ngrams.query(tokens_synonyms, self.docs)
    for i in range(len(top_docs)):
        progbar(i, len(top_docs))
        top_docs[i].update_score(ScoreType.ngram, ngrams_vector[i])
        top_docs[i].update_score(ScoreType.expanded_ngram, expanded_ngram_vector[i])
        top_docs[i].calculate_score()
    print(' ')
    top_docs.sort(key=lambda x: x.score, reverse=True)
    end = time.time()
    self.logger.info("execute_query complete. elapsed time: " + str(end - start) + " secs")
    return top_docs
def init(self, title, snippets):
    with open(title, "rb") as f:
        p = pickle.load(f)
    self.titles_count = len(p)
    # Shuffle the data
    sidx = np.random.permutation(self.titles_count)
    shffled_original_data = [(p[i][0], p[i][1], p[i][2]) for i in sidx]
    for _ in shffled_original_data:
        self.titles_raw.append(_[0].strip().split())
        self.titles_tok.append(utils.remove_stop_words(_[1].strip().split()))
        self.titles_id.append(_[2])
    self.titles_average_len = sum(
        [len(doc) + 0.0 for doc in self.titles_tok]) / self.titles_count
    with open(snippets, "rb") as f:
        self.id_snippets = pickle.load(f)
    for title in self.titles_tok:
        tmp = {}
        for word in title:
            tmp[word] = tmp.get(word, 0) + 1
        # Store the number of occurrences of each word in each document
        self.f.append(tmp)
        self.titles_vec.append(utils.sent2matrix(self.w2v, self.embedding_dim, title))
        for k in tmp.keys():
            self.df[k] = self.df.get(k, 0) + 1
    for k, v in self.df.items():
        self.idf[k] = math.log(self.titles_count - v + 0.5) - math.log(v + 0.5)
    for title in self.titles_tok:
        self.titles_idf.append(utils.sentence_idf_vector(self.idf, title))
def load_data(args, mode='train'):
    text = []
    usecols = list(map(lambda x: int(x), args.get('Data', 'usecols').split(',')))
    path = args.get('Data', 'dataset') + '/' + mode + '.csv'
    print('\n path: ' + path)
    data = pd.read_csv(path,
                       usecols=usecols,
                       encoding=args['Data'].get('encoding'),
                       sep=args['Data'].get('csv_sep'),
                       doublequote=True)
    labels = data.iloc[:, 0].tolist()
    if args.get('Data', 'dataset') == 'yelp':
        text = data.iloc[:, 1].tolist()
    elif args.get('Data', 'dataset') == 'ag_news':
        text = (data.iloc[:, 1] + ' ' + data.iloc[:, 2]).tolist()
    text = [remove_stop_words(t) for t in text]
    text = [remove_special_characters(t) for t in text]
    text = [remove_numbers(t) for t in text]
    text = [remove_single_letters(t) for t in text]
    text = [remove_single_characters(t) for t in text]
    return labels, text
def extract_key_words(file_name):
    """
    Args:
        file_name: an .mov file from which the key words will be extracted

    Returns:
        a list of phrases that are extracted from the input video file.
        These are the extracted keywords.
    """
    # Cut the lecture video into different scenes
    scenes = VideoToText.find_scenes(file_name,
                                     min_scene_length=1,
                                     abs_min=0.75,
                                     abs_max=0.98,
                                     find_subscenes=True,
                                     max_subscenes_per_minute=12)
    # For each scene, get its dictionary
    scene_text_dict, frequent_patterns = VideoToText.scene_to_text(scenes)
    print(scene_text_dict)
    print("Raw Dictionary ==================================================================================")
    # Combine the multiple dictionaries into one
    phraseDict = convert_dict_to_phraseDict(scene_text_dict)
    print(phraseDict)
    print("Combined raw dictionary ==================================================================================")
    # Clean up each dictionary by removing non-letters and non-numbers
    phraseDict = process_phraseDict(phraseDict)
    print(phraseDict)
    print("cleaned dictionaries ==================================================================================")
    # Process the dictionary by comparing frequency with the Brown corpus
    phraseList = utils.process_by_frequency(phraseDict)
    print(phraseList)
    print("initial phrase list ==================================================================================")
    # Process the phrase list by removing stop words
    phraseList = utils.remove_stop_words(phraseList)
    print(phraseList)
    print("Final list after removing stop words ====================================================================")
    print()
    # Extract frequent patterns and put them into a list
    print(frequent_patterns)
    print("frequent patterns ===============================================================================")
    print()
    frequent_patterns_list = utils.patterns_to_list(frequent_patterns)
    # Combine the frequent patterns with the single-phrase list into one list
    phraseList = phraseList + frequent_patterns_list
    print(phraseList)
    print("Final phrase list ===============================================================================")
    print()
    return phraseList
def show_grafo():
    words = request.args.getlist('word')
    date = request.args.get('date')
    accounts = request.args.getlist('account')
    polaridad = request.args.getlist('polaridad')
    tweets = get_filtros(words, date, accounts, polaridad)
    tweets_users = get_tweets_with_its_user(tweets)
    n = [{"tweets": t, "user": u} for t, u in zip(tweets, tweets_users)]
    # Users of the tweets
    userIds = []
    for tweet in tweets:
        userId = tweet['userId']
        if userId not in userIds:
            userIds.append(userId)
    users = []
    for userId in userIds:
        userInfo = searchUserId(userId)
        if len(userInfo) > 0:
            userTweets = []
            for tweet in tweets:
                if userId == tweet['userId']:
                    userTweets.append(tweet['text'])
            users.append({'user': '******' + userInfo[0]['screen_name'], 'tweets': userTweets})
    users2 = []
    for user in users:
        filtered = []
        for tweet in user['tweets']:
            words_text = list(tweet.split(" "))
            filtered.append(utils.remove_stop_words(words_text))
        palabras = filtered1(filtered)
        sin_num = utils.numbers(palabras)
        sin_imag = utils.https(sin_num)
        sin_puntos = utils.signos(sin_imag)
        sin_emojis = utils.emoji(sin_puntos)
        users2.append({'name': user['user'], 'words': sin_emojis})
    users3 = []
    for i in range(len(users2)):
        user1 = users2[i]
        words = []
        for j in range(len(users2)):
            user2 = users2[j]
            if i != j:
                for word in user1['words']:
                    for word2 in user2['words']:
                        if word.lower() == word2.lower() and word.lower() not in words and word != "":
                            words.append(word.lower())
        users3.append({'name': user1['name'], 'words': words})
    return {'tweets': n, 'data': users3}