import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


def vectorize_words(self, clean_profiles, max_features=500):
    # Vectorize the words in the cleaned profiles using
    # term frequency/inverse document frequency (TF-IDF).
    print("Creating the bag of words...\n")

    # Initialize the TfidfVectorizer object, scikit-learn's
    # TF-IDF-weighted bag-of-words tool.
    vectorizer = TfidfVectorizer(min_df=1, max_features=max_features)
    vectorizer._validate_vocabulary()

    # fit_transform() does two things: first, it fits the model and
    # learns the vocabulary; second, it transforms our training data
    # into feature vectors. The input to fit_transform should be a
    # list of strings.
    data_features = vectorizer.fit_transform(clean_profiles)

    # NumPy arrays are easy to work with, so convert the result to an
    # array.
    data_features = data_features.toarray()
    print(data_features.shape)

    vocab = vectorizer.get_feature_names()

    # Sum up the TF-IDF weights of each vocabulary word.
    dist = np.sum(data_features, axis=0)
    return vectorizer, data_features, vocab, dist
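# A self-contained usage sketch of the same TF-IDF flow (not part of
# the original project): the example profiles below are hypothetical,
# and get_feature_names_out() is the scikit-learn >= 1.0 spelling of
# the get_feature_names() call used above.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

clean_profiles = [
    "love hiking and cooking",
    "enjoy cooking italian food",
    "hiking trails every weekend",
]

vectorizer = TfidfVectorizer(min_df=1, max_features=50)
data_features = vectorizer.fit_transform(clean_profiles).toarray()

vocab = vectorizer.get_feature_names_out()
dist = np.sum(data_features, axis=0)

# Each vocabulary word paired with its summed TF-IDF weight,
# heaviest first.
for word, weight in sorted(zip(vocab, dist), key=lambda p: -p[1]):
    print(word, round(float(weight), 3))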
def search():
    main()
    global gv, g_matrix, terms, sparse_td_matrix
    gv = TfidfVectorizer(lowercase=True, sublinear_tf=True, use_idf=True, norm="l2")
    g_matrix = gv.fit_transform(both_data).T.tocsr()
    gv._validate_vocabulary()
    terms = gv.get_feature_names()
    sparse_td_matrix = sparse_matrix.T.tocsr()
    global matches
    matches = []
    stemmed = True
    boolean = 0
    words_known = False
    inp = request.args.get('query')
    if inp:
        both = ""
        for each in inp.split():
            if re.match(r'["][\w\s]+|[\w\s]+["]|["][\w\s]+["]', each):
                # Token carries quotation marks: search the unstemmed
                # documents for exact matches.
                both += each.strip('"') + " "
                stemmed = False
            else:
                both += stemmer.stem(each) + " "
            if each in d.keys():  # checks for any boolean operators
                boolean += 1
            # Check whether the token is in the corpus; if it is not,
            # report the unknown word and stop the loop.
            words_known = check_for_unknown_words(each.strip('"').lower(), stemmed)
            if not words_known:
                matches.append('Word "{}" is not found in corpus'.format(each))
                break
        inp = both.strip()
        inp = re.sub('"', '', inp)  # Removes the remaining quotation marks
        if stemmed:
            # Stem the query: stems every word if the query is a
            # multi-word phrase.
            inp = " ".join(stemmer.stem(each) for each in inp.split())
        if boolean != 0 and words_known:
            search_wikicorpus(inp, stemmed)
        if boolean == 0:
            term = inp.split()
            gv.ngram_range = (len(term), len(term))
            g_matrix = gv.fit_transform(both_data).T.tocsr()
            if words_known:
                search_wikicorpus(inp, stemmed)
            og_inp = request.args.get('query')  # retrieve_articles() doesn't work with stems (yet)
            try:
                # Prints the first few lines if there are exact
                # matches in the articles.
                retrieve_articles(og_inp)
            except SyntaxError:
                pass
    return render_template('index.html', matches=matches)
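# search() relies on the helper check_for_unknown_words(), whose
# definition is not shown in this excerpt. The sketch below is only a
# guess at its contract, not the project's actual implementation: it
# assumes the helper answers "does this token occur in the fitted
# vocabulary?", picking the vocabulary by the `stemmed` flag.
def check_for_unknown_words_sketch(token, stemmed, stemmed_vocab, exact_vocab):
    # stemmed_vocab would be the TF-IDF terms built from stemmed text;
    # exact_vocab the CountVectorizer vocabulary (t2i) of raw tokens.
    vocabulary = stemmed_vocab if stemmed else exact_vocab
    return token in vocabulary

# Hypothetical call, mirroring how search() uses the real helper:
# check_for_unknown_words_sketch("helsinki", True, set(terms), t2i)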
def main():
    with open("corpus/wikicorpus.txt", "r", encoding='UTF-8') as corpus:
        articles_str = ""
        for line in corpus:
            if re.search(r'<article name="', line):
                # Strip the opening tag but keep the article name.
                no_tags = re.sub(r'<article name="', "", line)
                no_tags_2 = re.sub(r'">', "", no_tags)
                articles_str += no_tags_2
            else:
                articles_str += line
    global articles
    articles = articles_str.split("</article>")
    global corpus_with_names
    corpus_with_names = {}
    for article in articles:
        lines = article.split('\n')
        if article == articles[0]:
            corpus_with_names[lines[0]] = ''.join(lines[1:])
        else:
            corpus_with_names[lines[1]] = ''.join(lines[2:])
    articles.pop()  # drop the empty tail after the last </article>
    global articlenames, gv, gv_stemmed, g_matrix, g_matrix_stemmed
    articlenames = list(corpus_with_names.keys())
    articledata = list(corpus_with_names[name] for name in articlenames)
    global stemmer
    stemmer = SnowballStemmer("english")
    documents = stem_documents()
    article_names = list(documents.keys())
    stemmed_data = list(documents[name] for name in article_names)
    global both_versions
    both_versions = {}  # dictionary with both normal and stemmed articles
    for article in corpus_with_names:
        tokens_2 = corpus_with_names[article].split()
        stemmed_data_2 = ' '.join(stemmer.stem(t) for t in tokens_2)
        both_versions[article] = corpus_with_names[article] + stemmed_data_2
    both_names = list(both_versions.keys())
    # Declared global so the Flask search() view can reuse them.
    global both_data, sparse_matrix
    both_data = list(both_versions[name] for name in both_names)
    gv = TfidfVectorizer(lowercase=True, sublinear_tf=True, use_idf=True, norm="l2")
    g_matrix = gv.fit_transform(both_data).T.tocsr()
    gv_stemmed = TfidfVectorizer(lowercase=True, sublinear_tf=True, use_idf=True, norm="l2")
    # Create a separate matrix for the stemmed data.
    g_matrix_stemmed = gv_stemmed.fit_transform(stemmed_data).T.tocsr()
    cv = CountVectorizer(lowercase=True, binary=True)
    gv._validate_vocabulary()
    gv_stemmed._validate_vocabulary()  # Validate stemmed vocabulary
    sparse_matrix = cv.fit_transform(articles)
    # Dense copies of the binary term-document matrix.
    binary_dense_matrix = sparse_matrix.T.todense()
    dense_matrix = sparse_matrix.T.todense()
    global terms
    terms = gv.get_feature_names()
    global stemmed_terms
    stemmed_terms = gv_stemmed.get_feature_names()  # Get the stemmed feature names
    global sparse_td_matrix
    sparse_td_matrix = sparse_matrix.T.tocsr()
    global d
    d = {"and": "&", "AND": "&",
         "or": "|", "OR": "|",
         "not": "1 -", "NOT": "1 -",
         "(": "(", ")": ")"}  # operator replacements
    global t2i
    t2i = cv.vocabulary_

    # query_stemmed = input("Search stemmed documents? y/n: ")
    # # Asks whether user would like to search stemmed results
    # if query_stemmed == "y":
    #     stemmed = True
    # else:
    #     stemmed = False
    while True:
        stemmed = True
        boolean = 0
        inp = input("Search for a document: ")  # asks user for input
        if inp == '':
            break
        both = ""
        for each in inp.split():
            if re.match(r'["][\w\s]+["]', each):
                # Token carries quotation marks: search the unstemmed
                # documents for exact matches.
                both += each.strip('"') + " "
                stemmed = False
            else:
                both += stemmer.stem(each) + " "
        inp = both.strip()
        inp = re.sub('"', '', inp)  # Removes the remaining quotation marks
        if stemmed:
            # Stem the query: stems every word if the query is a
            # multi-word phrase.
            inp = " ".join(stemmer.stem(each) for each in inp.split())
        if check_for_unknown_words(inp, stemmed):
            for t in inp.split():
                if t in d.keys():  # checks for any boolean operators
                    retrieve_articles(inp)
                    boolean += 1
                    break
            if boolean == 0 and len(inp.split()) == 1:  # the query is a single word
                if stemmed:
                    gv_stemmed.ngram_range = (1, 1)
                    g_matrix_stemmed = gv_stemmed.fit_transform(stemmed_data).T.tocsr()
                else:
                    gv.ngram_range = (1, 1)
                    g_matrix = gv.fit_transform(both_data).T.tocsr()
                search_wikicorpus(inp, stemmed)
            elif boolean == 0:  # the query is a multi-word phrase
                term = inp.split()
                if stemmed:
                    gv_stemmed.ngram_range = (len(term), len(term))
                    g_matrix_stemmed = gv_stemmed.fit_transform(stemmed_data).T.tocsr()
                else:
                    gv.ngram_range = (len(term), len(term))
                    g_matrix = gv.fit_transform(both_data).T.tocsr()
                search_wikicorpus(inp, stemmed)
def search():
    full_filename_gif = os.path.join(app.config['UPLOAD_FOLDER'], 'twitter_bird.gif')
    full_filename_png = os.path.join(app.config['UPLOAD_FOLDER'], 'twitter_image.png')
    # Move the selected language (and English) to the top of the
    # language menu, keeping the rest sorted.
    if selected_language != 'en':
        languages.pop(languages.index(selected_language))
        languages.pop(languages.index('en'))
        languages.sort()
        languages.insert(0, selected_language)
        languages.insert(1, 'en')
    else:
        languages.pop(languages.index('en'))
        languages.sort()
        languages.insert(0, 'en')
    global tweets_data, tweets_id
    tweets_data = lang_tweet_dict[selected_language]
    tweets_id = lang_id_dict[selected_language]
    global gv, g_matrix, terms
    gv = TfidfVectorizer(lowercase=True, sublinear_tf=True, use_idf=True, norm="l2")
    g_matrix = gv.fit_transform(tweets_data).T.tocsr()
    gv._validate_vocabulary()
    terms = gv.get_feature_names()
    global matches
    matches = []
    words_known = False
    error = ""
    stemmed = True
    plot_2 = selected_language == 'en'  # Draw the themes plot only for English
    inp = request.args.get('query')
    if inp:
        both = ""
        for each in inp.split():
            if re.match(r'["][\w\s]+|[\w\s]+["]|["][\w\s]+["]', each) \
                    or selected_language not in stemmer_dict.keys():
                # Token carries quotation marks, or the language has no
                # stemmer: search unstemmed documents (exact matches).
                both += each.strip('"') + " "
                stemmed = False
            else:
                stemmer = SnowballStemmer(stemmer_dict[selected_language])
                both += stemmer.stem(each) + " "
            # Check whether the token is in the corpus; if it is not,
            # tell the user which language corpus was searched and
            # stop the loop.
            words_known = check_for_unknown_words(each.strip('"').lower())
            if not words_known:
                if selected_language in stemmer_dict.keys():
                    language = stemmer_dict[selected_language].capitalize()
                else:
                    language = selected_language.upper()
                error = 'Word "{}" is not found in {} corpus.'.format(each, language)
                break
        inp = both.strip()
        inp = re.sub('"', '', inp)  # Removes the remaining quotation marks
        if stemmed:
            # Stem the query: stems every word if the query is a
            # multi-word phrase.
            inp = " ".join(stemmer.stem(each) for each in inp.split())
        if len(inp.split()) > 1:
            # Refit with an n-gram range matching the phrase length so
            # the whole phrase can appear as one vocabulary entry.
            term = inp.split()
            gv.ngram_range = (len(term), len(term))
            g_matrix = gv.fit_transform(tweets_data).T.tocsr()
            multiword_terms = gv.get_feature_names()
            if inp not in multiword_terms:
                error = f'Phrase "{inp}" is not found in corpus.'
                words_known = False
            else:
                words_known = True
        if words_known:
            term = inp.split()
            gv.ngram_range = (len(term), len(term))
            g_matrix = gv.fit_transform(tweets_data).T.tocsr()
            search_wikicorpus(inp)
    return render_template('index.html', matches=matches, languages=languages,
                           countries=countries, words_known=words_known,
                           error=error, plot_2=plot_2,
                           full_filename_gif=full_filename_gif,
                           full_filename_png=full_filename_png,
                           selected_language=selected_language)
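# search_wikicorpus() is not shown in this excerpt. Given that both
# search() views build g_matrix as an L2-normalised TF-IDF
# term-document matrix (terms x documents, CSR), the ranking step it
# performs is presumably the standard cosine-similarity dot product.
# The sketch below shows that step under those assumptions; the
# function and parameter names are illustrative, not the project's.
import numpy as np

def rank_documents_sketch(query, vectorizer, td_matrix, doc_names):
    # Project the query into the same TF-IDF space. td_matrix is
    # terms x documents, so query_vec (1 x terms) times td_matrix
    # yields one cosine score per document.
    query_vec = vectorizer.transform([query])
    scores = query_vec.dot(td_matrix).toarray().ravel()
    ranked = np.argsort(-scores)
    return [(scores[i], doc_names[i]) for i in ranked if scores[i] > 0]

# Hypothetical call with the globals built above:
# rank_documents_sketch("cat", gv, g_matrix, articlenames)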