Example #1
	# Requires: from sklearn.feature_extraction.text import TfidfVectorizer
	# and: import numpy as np
	def vectorize_words(self, clean_profiles, max_features=500):
		# Vectorize the words in the cleaned profiles using 
		# term frequency/inverse document frequency (TF-IDF)
		print "Creating the bag of words...\n"

		# Initialize the "TfidfVectorizer" object, which is scikit-learn's
		# TF-IDF bag-of-words tool.
		vectorizer = TfidfVectorizer(min_df=1, max_features=max_features)
		# _validate_vocabulary() is a private scikit-learn helper; it checks for
		# a user-supplied vocabulary and sets the fixed_vocabulary_ flag.
		vectorizer._validate_vocabulary()

		# fit_transform() does two things: First, it fits the model
		# and learns the vocabulary; second, it transforms our training data
		# into feature vectors. The input to fit_transform should be a list of 
		# strings.
		data_features = vectorizer.fit_transform(clean_profiles)

		# Numpy arrays are easy to work with, so convert the result to an 
		# array
		data_features = data_features.toarray()
		print(data_features.shape)

		# Note: use get_feature_names_out() instead on scikit-learn >= 1.0
		vocab = vectorizer.get_feature_names()

		# Sum up the TF-IDF weights of each vocabulary word across all profiles
		dist = np.sum(data_features, axis=0)

	
		return vectorizer, data_features, vocab, dist
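
A minimal standalone sketch of the same workflow, runnable outside the class; the two profile strings are made-up inputs, and get_feature_names_out() is the scikit-learn >= 1.0 replacement for get_feature_names():

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

clean_profiles = ["likes hiking and cooking", "enjoys cooking and travel"]  # made-up inputs

vectorizer = TfidfVectorizer(min_df=1, max_features=500)
data_features = vectorizer.fit_transform(clean_profiles).toarray()

vocab = vectorizer.get_feature_names_out()
dist = np.sum(data_features, axis=0)  # total TF-IDF weight per word
for word, weight in zip(vocab, dist):
    print(word, round(weight, 3))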
Example #2
def search():

        main()

        global gv, g_matrix, gv_stemmed, g_matrix_stemmed, terms, stemmed_terms, sparse_td_matrix
        gv = TfidfVectorizer(lowercase=True, sublinear_tf=True, use_idf=True, norm="l2")
        g_matrix = gv.fit_transform(both_data).T.tocsr()
  
        # gv_stemmed = TfidfVectorizer(lowercase=True, sublinear_tf=True, use_idf=True, norm="l2")
        # g_matrix_stemmed = gv_stemmed.fit_transform(stemmed_data).T.tocsr()

        gv._validate_vocabulary()
        #gv_stemmed._validate_vocabulary() # Validate stemmed vocabulary

        terms = gv.get_feature_names()
        #stemmed_terms = gv_stemmed.get_feature_names() # Get the stemmed feature names
        sparse_td_matrix = sparse_matrix.T.tocsr()  # relies on sparse_matrix being set globally by main()
        

    #while True:
        global matches
        matches = []
        stemmed = True
        boolean = 0
        inp = request.args.get('query')
        #inp = input("Search for a document: ")  # asks user for input
        if inp:
     #   break
            both = ""
            for each in inp.split():
                if re.match('["][\w\s]+|[\w\s]+["]|["][\w\s]+["]', each): # Checks if input has quotation marks
                    both += each.strip('"') + " "
                    stemmed = False # Sets the input to search unstemmed documents (exact matches)
                else:
                    both += stemmer.stem(each) + " "

                if each in d.keys(): # checks for any boolean operators
                    boolean += 1

                inp = both.strip()       
                inp = re.sub('"', '', inp) # Removes quotation marks

                words_known = check_for_unknown_words(each.strip('"').lower(), stemmed)  # Check if the token is in corpus;
                if not words_known:                                                      # if it's not, stop the loop
                    matches.append('Word "{}" is not found in corpus'.format(each))
                    break

            if stemmed: # Stem the query
                stemmed_inp = " ".join(stemmer.stem(each) for each in inp.split()) # stems every word if query is a multi-word phrase
                inp = stemmed_inp

            #for t in inp.split(): # checks for any boolean operators
            #   if t in d.keys():
            #      boolean += 1
            #     break

            if boolean != 0 and words_known:
                search_wikicorpus(inp, stemmed)

            if boolean == 0:
                term = inp.split()
                #if stemmed:
                #gv_stemmed.ngram_range = (len(term), len(term))
                #g_matrix_stemmed = gv_stemmed.fit_transform(stemmed_data).T.tocsr()
            #else:
                gv.ngram_range = (len(term), len(term))
                g_matrix = gv.fit_transform(both_data).T.tocsr()

            if words_known:
                search_wikicorpus(inp, stemmed)

        og_inp = request.args.get('query')  # retrieve_articles() doesn't work with stems (yet)
        try:
            retrieve_articles(og_inp)  # Prints the first few lines if there are exact matches in the articles
        except SyntaxError:
            pass   

        return render_template('index.html', matches=matches)
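
The check_for_unknown_words helper is not included in this example. A hypothetical sketch, inferred only from its call sites (a token plus a stemmed flag, returning a boolean), might look like this; it is not the authors' actual code:

def check_for_unknown_words(token, stemmed):
    # Hypothetical: look the token up in the vocabulary of the matching
    # vectorizer (stemmed or unstemmed), assuming terms/stemmed_terms are set.
    vocabulary = stemmed_terms if stemmed else terms
    return token in vocabulary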
Example #3
def main():

    corpus = open("corpus/wikicorpus.txt", "r", encoding='UTF-8')

    articles_str = ""
    for line in corpus:
        if re.search(r'<article name="', line):
            no_tags = re.sub(r'<article name="', "", line)
            no_tags_2 = re.sub(r'">', "", no_tags)
            articles_str += no_tags_2

        else:
            articles_str += line

    corpus.close()  # done reading the corpus file

    global articles
    articles = articles_str.split("</article>")

    global corpus_with_names
    corpus_with_names = {}
    for article in articles:
        lines = article.split('\n')
        if article == articles[0]:
            corpus_with_names[lines[0]] = ''.join(lines[1:])
        else:
            corpus_with_names[lines[1]] = ''.join(lines[2:])

    articles.pop()  # drop the empty string left after the final </article> tag

    global articlenames, gv, gv_stemmed, g_matrix, g_matrix_stemmed
    articlenames = list(corpus_with_names.keys())
    articledata = list(corpus_with_names[name] for name in articlenames)

    global stemmer
    stemmer = SnowballStemmer("english")
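
    # stem_documents() is not defined in this example. A hypothetical sketch,
    # inferred from how its result is used below (a name -> stemmed-text dict),
    # could be a local helper like this:
    def stem_documents():
        return {name: " ".join(stemmer.stem(t) for t in text.split())
                for name, text in corpus_with_names.items()}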

    documents = stem_documents()
    article_names = list(documents.keys())
    stemmed_data = list(documents[name] for name in article_names)

    global both_versions
    both_versions = {}  # dictionary with both normal and stemmed articles

    for article in corpus_with_names:
        tokens_2 = corpus_with_names[article].split()
        stemmed_data_2 = ' '.join(stemmer.stem(t) for t in tokens_2)
        both_versions[article] = corpus_with_names[article] + stemmed_data_2

    both_names = list(both_versions.keys())
    both_data = list(both_versions[name] for name in both_names)

    gv = TfidfVectorizer(lowercase=True,
                         sublinear_tf=True,
                         use_idf=True,
                         norm="l2")
    g_matrix = gv.fit_transform(both_data).T.tocsr()

    gv_stemmed = TfidfVectorizer(lowercase=True,
                                 sublinear_tf=True,
                                 use_idf=True,
                                 norm="l2")
    # Create a separate matrix for the stemmed data
    g_matrix_stemmed = gv_stemmed.fit_transform(stemmed_data).T.tocsr()

    cv = CountVectorizer(lowercase=True, binary=True)
    gv._validate_vocabulary()
    gv_stemmed._validate_vocabulary()  # Validate stemmed vocabulary
    sparse_matrix = cv.fit_transform(articles)
    binary_dense_matrix = sparse_matrix.T.todense()  # reuse the fitted matrix instead of re-fitting
    dense_matrix = sparse_matrix.T.todense()

    global terms
    terms = gv.get_feature_names()

    global stemmed_terms
    stemmed_terms = gv_stemmed.get_feature_names()  # Get the stemmed feature names

    global sparse_td_matrix
    sparse_td_matrix = sparse_matrix.T.tocsr()

    global d
    d = {
        "and": "&",
        "AND": "&",
        "or": "|",
        "OR": "|",
        "not": "1 -",
        "NOT": "1 -",
        "(": "(",
        ")": ")"
    }  # operator replacements

    global t2i
    t2i = cv.vocabulary_
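
    # Hypothetical illustration (not part of the original code): d and t2i
    # follow the usual pattern of rewriting a boolean query into a matrix
    # expression that eval() can compute on the binary term-document matrix,
    # e.g. "cat AND dog" -> 'sparse_td_matrix[t2i["cat"]].todense() & ...'.
    def rewrite_token(t):
        return d.get(t, 'sparse_td_matrix[t2i["{:s}"]].todense()'.format(t))

    def eval_boolean_query(query):
        return eval(" ".join(rewrite_token(t) for t in query.split()))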

    #    query_stemmed = input("Search stemmed documents? y/n: ")  # Asks whether user would like to search stemmed results
    #    if query_stemmed == "y":
    #        stemmed = True
    #    else:
    #        stemmed = False

    while True:
        stemmed = True
        boolean = 0
        inp = input("Search for a document: ")  # asks user for input
        if inp == '':
            break
        both = ""
        for each in inp.split():
            if re.match(r'["][\w\s]+["]', each):  # Checks if input has quotation marks
                both += each.strip('"') + " "
                stemmed = False  # Sets the input to search unstemmed documents (exact matches)
            else:
                both += stemmer.stem(each) + " "

        inp = both.strip()
        inp = re.sub('"', '', inp)  # Removes quotation marks

        if stemmed:  # Stem the query
            inp = " ".join(stemmer.stem(each) for each in inp.split())  # stems every word if query is a multi-word phrase

        if check_for_unknown_words(inp, stemmed):
            for t in inp.split():
                if t in d.keys():
                    retrieve_articles(inp)
                    boolean += 1
                    break
            if boolean == 0 and len(inp.split()) == 1:  # if the query consists of 1 word
                if stemmed:
                    gv_stemmed.ngram_range = (1, 1)
                    g_matrix_stemmed = gv_stemmed.fit_transform(stemmed_data).T.tocsr()
                else:
                    gv.ngram_range = (1, 1)
                    g_matrix = gv.fit_transform(both_data).T.tocsr()
                search_wikicorpus(inp, stemmed)
            elif boolean == 0:  # if the query is a multi-word phrase
                term = inp.split()
                if stemmed:
                    gv_stemmed.ngram_range = (len(term), len(term))
                    g_matrix_stemmed = gv_stemmed.fit_transform(stemmed_data).T.tocsr()
                else:
                    gv.ngram_range = (len(term), len(term))
                    g_matrix = gv.fit_transform(both_data).T.tocsr()
                search_wikicorpus(inp, stemmed)
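
The search_wikicorpus helper is not shown in the example. Since the vectorizers use norm="l2" and the matrices are stored transposed (terms on rows, documents on columns), a plausible sketch of the ranking step, and only a sketch, would be:

def search_wikicorpus(query, stemmed):
    # Hypothetical reconstruction: score documents by cosine similarity
    # (a plain dot product, because the TF-IDF vectors are L2-normalized).
    vectorizer, matrix = (gv_stemmed, g_matrix_stemmed) if stemmed else (gv, g_matrix)
    query_vec = vectorizer.transform([query])        # 1 x terms, sparse
    scores = (query_vec @ matrix).toarray().ravel()  # one score per document
    for idx in scores.argsort()[::-1][:10]:          # ten best matches first
        if scores[idx] > 0:
            print(scores[idx], articlenames[idx])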
Example #4
def search():
    full_filename_gif = os.path.join(app.config['UPLOAD_FOLDER'],
                                     'twitter_bird.gif')
    full_filename_png = os.path.join(app.config['UPLOAD_FOLDER'],
                                     'twitter_image.png')

    if selected_language != 'en':
        languages.pop(languages.index(selected_language))
        languages.pop(languages.index('en'))
        languages.sort()
        languages.insert(0, selected_language)
        languages.insert(1, 'en')
    elif selected_language == 'en':
        languages.pop(languages.index('en'))
        languages.sort()
        languages.insert(0, 'en')

    global tweets_data, tweets_id
    tweets_data = lang_tweet_dict[selected_language]
    tweets_id = lang_id_dict[selected_language]

    global gv, g_matrix, terms
    gv = TfidfVectorizer(lowercase=True,
                         sublinear_tf=True,
                         use_idf=True,
                         norm="l2")
    g_matrix = gv.fit_transform(tweets_data).T.tocsr()
    gv._validate_vocabulary()
    terms = gv.get_feature_names()

    global matches
    matches = []
    words_known = False
    error = ""
    stemmed = True
    plot_2 = selected_language == 'en'  # Draw themes plot only if language is English
    inp = request.args.get('query')
    if inp:
        both = ""
        for each in inp.split():
            # Checks if input has quotation marks or the language has no stemmer
            if re.match(r'["][\w\s]+|[\w\s]+["]|["][\w\s]+["]', each) \
                    or selected_language not in stemmer_dict.keys():
                both += each.strip('"') + " "
                stemmed = False  # Sets the input to search unstemmed documents (exact matches)
            else:
                stemmer = SnowballStemmer(stemmer_dict[selected_language])
                both += stemmer.stem(each) + " "

            inp = both.strip()
            inp = re.sub('"', '', inp)  # Removes quotation marks

            # Check if the token is in the corpus; if it's not, stop the loop
            words_known = check_for_unknown_words(each.strip('"').lower())
            if not words_known:
                # Inform the user which language was used as corpus
                if selected_language in stemmer_dict.keys():
                    language = stemmer_dict[selected_language].capitalize()
                else:
                    language = selected_language.upper()
                error = 'Word "{}" is not found in {} corpus.'.format(each, language)
                break

        if stemmed:  # Stem the query
            inp = " ".join(stemmer.stem(each) for each in inp.split())  # stems every word if query is a multi-word phrase

        if len(inp.split()) > 1:
            term = inp.split()
            gv.ngram_range = (len(term), len(term))
            g_matrix = gv.fit_transform(tweets_data).T.tocsr()
            multiword_terms = gv.get_feature_names()
            if inp not in multiword_terms:
                error = f'Phrase "{inp}" is not found in corpus.'
                words_known = False
            else:
                words_known = True

        if words_known:
            term = inp.split()
            gv.ngram_range = (len(term), len(term))
            g_matrix = gv.fit_transform(tweets_data).T.tocsr()
            search_wikicorpus(inp)

    return render_template('index.html',
                           matches=matches,
                           languages=languages,
                           countries=countries,
                           words_known=words_known,
                           error=error,
                           plot_2=plot_2,
                           full_filename_gif=full_filename_gif,
                           full_filename_png=full_filename_png,
                           selected_language=selected_language)
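
For context on the re-fitting with ngram_range=(len(term), len(term)) above: setting both bounds to the phrase length makes every n-word phrase a single vocabulary feature, so an exact phrase can be looked up in the feature names directly. A quick illustration on made-up data (get_feature_names_out() is the scikit-learn >= 1.0 spelling):

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["new york city", "york is old"]  # made-up data
gv2 = TfidfVectorizer(lowercase=True, ngram_range=(2, 2))
gv2.fit(docs)
print(gv2.get_feature_names_out())
# ['is old', 'new york', 'york city', 'york is']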