Example #1
    def testKnownTFIDF(self):
        """
        Testing to see whether the tfidf values for arbitrarily selected words 
        in the articles correspond with manually calculated values.
        """
        articleList = []
        theList = []

        for string in self.strings:
            articleList.append(tfidf.tf(string))

        for string in self.theTwentyFive:
            theList.append(tfidf.tf(string))

        idfArtDict = tfidf.idf(articleList)
        idfTheDict = tfidf.idf(theList)

        tfidfArtList = tfidf.tfidf(idfArtDict, articleList)
        tfidfTheList = tfidf.tfidf(idfTheDict, theList)

        self.assertEqual(tfidfArtList[1]["Meditation"], math.log10(6/1) * (1/19))
        self.assertEqual(tfidfArtList[2]["books"], math.log10(6/1) * (1/18))
        self.assertEqual(tfidfArtList[5]["the"], math.log10(6/3) * (5/5))

        self.assertEqual(tfidfTheList[3]["the"], math.log10(5/5) * (5/5))
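For reference, here is a minimal, self-contained sketch of the tf/idf/tfidf helpers this test appears to exercise. The real module's signatures are only inferred from the calls above (tf per string, idf over the list of tf dicts, tfidf combining the two with log10(N/df) * count/len), so treat this as an assumption rather than the project's implementation.

import math

def tf(text):
    # Per-document term frequency: count / total words.
    words = text.split()
    return {w: words.count(w) / len(words) for w in set(words)}

def idf(tf_dicts):
    # log10(N / number of documents containing the word), matching the asserts above.
    n = len(tf_dicts)
    vocab = set().union(*tf_dicts)
    return {w: math.log10(n / sum(1 for d in tf_dicts if w in d)) for w in vocab}

def tfidf(idf_dict, tf_dicts):
    # One {word: tf * idf} dict per document.
    return [{w: freq * idf_dict[w] for w, freq in d.items()} for d in tf_dicts]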
Example #2
def main():
	features = lsi(tfidf('small_train.txt'), 100)
	classes = get_classes('small_train.txt', 0.7, 5)
	prob  = svm_problem(classes, features)
	param = svm_parameter('-t 0 -c 4 -b 1')
	m = svm_train(prob, param)
	test_features = lsi(tfidf('small_test.txt'), 100)
	test_classes = get_classes('small_test.txt', 0.7, 5)
	p_label, p_acc, p_val = svm_predict(test_classes, test_features, m)
Example #3
def get_acc(trainfilename, testfilename, d, a, r):
  # Use the filename arguments rather than hard-coded paths.
  features = lsi(tfidf(trainfilename), d)
  classes = get_classes(trainfilename, a, r)
  prob  = svm_problem(classes, features)
  param = svm_parameter('-t 0 -c 4 -b 1')
  m = svm_train(prob, param)
  test_features = lsi(tfidf(testfilename), d)
  test_classes = get_classes(testfilename, a, r)
  p_label, p_acc, p_val = svm_predict(test_classes, test_features, m)
  return p_acc
Example #4
def scrape():
	try:
		querystring = request.args.get("querystring")
		print("Recieved scrape query: " + querystring)
		scraper_values, stuff, index = scraper_df(querystring, 100)
		tfidf_values = tfidf(scraper_values)
		tfidf_pd_values = tfidf_df(scraper_values, stuff, querystring)
		db.insert({"querystring" : querystring, "tfidf" : tfidf_values, "tfidf_pd_values" : tfidf_pd_values})
		indexStorage[querystring] = index
		return jsonify({
			"tfidf" : tfidf_values,
			"specifics" : tfidf_pd_values
		})
	except Exception as e:
		print(e)
		queryer = Query()
		if len(db.search(queryer.querystring == querystring)):
			return jsonify({
					"tfidf" : db.search(queryer.querystring == querystring)[0]["tfidf"],
					"specifics" : db.search(queryer.querystring == querystring)[0]["tfidf_pd_values"]
				})
		else:
			return jsonify({
					"tfidf" : "ERROR",
					"specifics" : "ERROR"
				})
Example #5
def generate_weights(documents, lexicon):
    """ function: generate_weights
        --------------------------
        perform tf-idf to generate importance scores for words in documents

        :param documents: list of documents to use in calculations
        :param lexicon: dict of 'title' and 'body' word sets
        :returns: dictionary of dictionaries: {"id_" : {"word" : score,...}}
    """
    # weight = { 'document' : { 'word' : score,... },... }
    weights = dict()
    m = tfidf()
    print('Adding documents for TF-IDF...')
    for i, document in enumerate(documents):
        m.addDocument(i,
                      document['words']['title'] + document['words']['body'])
        weights[i] = dict()
    # generate dictionary of { "word", "score" } pairs for each document
    print('Generating weight scores for words; This WILL take time...')
    for word in lexicon['title'] & lexicon['body']:
        m.get_similarities(word, weights, 'smooth', 1.25)
    for word in lexicon['title'] - lexicon['body']:
        m.get_similarities(word, weights, 'smooth', 1.1)
    for word in lexicon['body'] - lexicon['title']:
        m.get_similarities(word, weights, 'smooth')
    return weights
Example #6
 def getDocumentSimilarity(self, queryTermsMap, trainDoc):
     tfIdfHelper = tfidf(self.wordDF, self.numDocs, self.meanDocLen)
     similarity = 0.0
     for word in trainDoc.wordsMap:
         if queryTermsMap.has_key(word):
             similarity += tfIdfHelper.getTfIdf(queryTermsMap[word],word, trainDoc)
     return similarity
Example #7
 def set_up_table(self, tokens):
     table = tfidf.tfidf()
     i = 0
     for token in tokens:
         table.addDocument(self.ids[i],token)
         i = i+1
     return table
Example #8
def build_tfidf(foreignBrand):
    datas = []
    rs = []
    merge_to_series = False
    (brand, vendor, series) = foreignBrand.split(u"#")
    #print brand.encode("utf-8") + ":" + maker.encode("utf-8")

    res = external_spec_col.find({
        'site': sys.argv[1],
        'brand.name': brand,
        "vendor.name": vendor,
        "series.name": series
    })
    if res.count() == 1:
        r = res[0]
        if r['model'].get('key', '') == '0':
            merge_to_series = True
            data = [
                r['model']['id'], r['brand']['name'], r['vendor']['name'],
                r['series']['name'], r['model']['name'],
                r['model'].get('year', '') or ''
            ]
            datas.append(data)
    if merge_to_series:
        data = datas[0]
        k = "#".join(data)
        return k

    for r in res:
        data = [
            r['model']['id'], r['brand']['name'], r['vendor']['name'],
            r['series']['name'], r['model']['name'],
            r['model'].get('year', '') or ''
        ]
        datas.append(data)
        rs.append(r)

    #print "------------------------------"

    engine_dict = {}
    if len(rs) == 0:
        print "empty engine map " + foreignBrand
        #return {}
    for i in xrange(len(rs)):
        e = getExternalEngine(rs[i])

        if engine_dict.has_key(e):
            tf = engine_dict[e]
        else:
            tf = tfidf.tfidf()
            engine_dict[e] = tf
        data = datas[i]
        k = "#".join(data)
        n = "#".join(data[1:])
        c = word_bag(n)
        tf.addDocument(k, c)
    #if engine_dict == {}:
    #print len(rs)
    #print len(datas)
    return engine_dict
Example #9
def preprocess():
    #dirname = "Genres"
    dirname = "Data/Train"
    #dirname = str(sys.argv[1])
    #dirname = "C:/Users/Megha/Documents/studies/544/HW1/project/SpamorHam/train"
    global labels, x, sq, idf
    for dirpath, dirs, files in os.walk(dirname):
        for filename in files:
            if filename[filename.rfind(".") + 1:] == "txt":
                i = filename
                x[i] = {}
                fname = os.path.join(dirpath, filename)
                f = open(fname, "r", encoding="latin1")
                text = f.read()
                f.close()
                tokens = text.split()
                for token in tokens:
                    token = strip_nonalnum_re(token)
                    if token.lower() in stopwords:
                        continue
                    if token not in x[i]:
                        x[i][token] = 1
                    else:
                        x[i][token] += 1
    x, idf, sq = tfidf.tfidf(x)
Example #10
 def set_up_table2(self, ids):
     table = tfidf.tfidf()
    # print ids
     for id in ids:
         token = self.dict[id]
         table.addDocument(id,token)
     return table
Example #11
def process_files(in_dir):
	
	global WORDS, TRAIN
	
	WORDS = in_dir+'words'
	TRAIN = in_dir+'train.dat'
	
	CORRECT_LIST = []
	matches = []
	#CORRECT_LIST = (open(in_dir+'train_corr','r')).readlines()
	for line in (open(in_dir+'train_corr','r')).readlines():
		if line.replace('\n','') != '':
			CORRECT_LIST.append(line.replace('\n',''))
			matches.append(line.replace('\n',''))
	#print CORRECT_LIST
	for line in open(in_dir+'train_wrong','r').readlines():
                if line.replace('\n','') != '':
			matches.append(line.replace('\n',''))
	#print matches
	#matches = (open(in_dir+'train_corr','r')).readlines() + (open(in_dir+'train_wrong','r')).readlines()
	words_per_file = dict()
		
	words = set()
	length = str(len(matches))
	count = 0
	user_counter = 0 
	# first iteration through matches to get all words
	for match in matches:
		count +=1
		print str(count) + " of " + length + " training example preprocessing done"
		t = codeParser(match)
		#if 'tweet' in get_all_words(match) :
		#	user_counter += 1
		#	print user_counter
		words_per_file[match] = t.get_compressed()
		words = words.union(words_per_file[match])
	
	print "preliminary processing done"
	train_f = open(TRAIN, 'w')

	# second iteration through matches to get all word counts
	count = 0
	calc_freq = tfidf(words, words_per_file)
	print "Done calculating idfs"	
	#print "PARAM " + str(calc_freq.getIDF('@param'))
	for match in matches:
		count +=1
		print str(count) + " of " + length + " training examples done"
	
		if match in CORRECT_LIST:
			#print calc_freq.term_freq('tweet',words_per_file[match])
			train_f.write('+1')
			#print len(words_per_file[match])
		else:
			train_f.write('-1')
		
		train_f.write(str(get_word_counts(words_per_file[match], words, calc_freq)))
		train_f.write('\n')
	train_f.close()	
	print "Results printed to file: " + str(TRAIN)
Example #12
def analyzeBlogs(blogList): # Analyze blog with tfidf, and other word analysis.
    outputWordsArr  = []
    namesCount, religionCount, weaponryCount, governmentCount, wordCount = 0, 0, 0, 0, 0
    for i, blog in enumerate(blogList):
        scores = {}
        wordCount = 0
        print("Top words in document {}".format(i + 1))
        for word in blog.words:
            flag = True
            word = word.lower() # Everything is in lowercase. 
            for punc in terms.punctuation():
                if punc in word:
                    flag = False
            wordCount+=1
            if flag:  
                scores[word] = tfidf.tfidf(word, blog, blogList) # run tfidf
                if word in terms.governmentTerms(): # increment count based on content to find word densities. 
                    governmentCount+=1
                if word in terms.weaponsTerms():
                    weaponryCount+=1
                if word in terms.femaleNames() or word in terms.maleNames():
                    namesCount+=1
                if word in terms.religiousTerms():
                    religionCount+=1
        # Sort once per blog, after all word scores have been computed.
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True) # sort the words
        for word, score in sorted_words[0:10]:
            print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
            outputWordsArr.append((word, round(score, 10)))
        print("---------------------------------------------------------")
    # Gathering the density scores of each of these defined features, and creating the returning data type
    analysisOutputs = AnalysisObject(namesCount/wordCount,religionCount/wordCount,weaponryCount/wordCount,governmentCount/wordCount,outputWordsArr)
    return analysisOutputs
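The tfidf.tfidf(word, blog, blogList) call above follows the three-argument, TextBlob-tutorial-style signature (word, document, corpus). Below is a self-contained sketch of that style with a stand-in Blob type; it assumes the usual tf * idf definition and is not the project's actual module.

import math
from collections import namedtuple

Blob = namedtuple('Blob', 'words')   # stand-in for a TextBlob-like object

def tf(word, blob):
    return blob.words.count(word) / len(blob.words)

def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob.words)

def idf(word, bloblist):
    # +1 guards against words that appear in no document.
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)

docs = [Blob('the cat sat on the mat'.split()),
        Blob('the dog ran away'.split()),
        Blob('the bird flew off'.split())]
print(round(tfidf('cat', docs[0], docs), 4))   # ~0.0676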
Example #13
def find_parallels():

    sim_matrix = []

    count = 0
    with codecs.open("data/only_shorashim.json", 'rb', encoding='utf8') as fp:
        shorash_obj = json.load(fp)
        all_words = set()
        for k, v in shorash_obj.items():
            if k == u"Obadiah":
                continue
            all_words |= set(reduce_list(reduce_list(v)))

        table = tfidf.tfidf(all_words)
        input_ketaim_into_table(shorash_obj, table, 2)
        for keta_obj, keta_dict in table.documents.items():
            similarities = table.similarities(keta_obj, 5)
            sim_matrix += similarities

            for similarity in similarities:
                if similarity[1] > 0.5:
                    print u"{}\n\t{}\n\t{}".format(similarity[1], keta_obj,
                                                   similarity[0])

        pass
Example #14
def main():
    #cleanData()
    #sys.exit()

    # Initialize feature set.
    file = open('feature_set.txt', 'r')
    #file = open('feature_set_2000.txt','r')
    feature_set = file.read().split('\n')
    file.close()

    # Initialize the table that contains all information of replies.
    #file = open('jieba_data2.txt','r')
    file = open('jieba_data_full.txt', 'r')
    lines = file.readlines()
    file.close()
    table = tfidf.tfidf()
    for iter in range(len(lines)):
        reply_info = lines[iter].split('(#SEP#)')
        rate_flag = reply_info[0]
        reply_length = reply_info[1]
        lines_num = reply_info[2]
        dividedWords_list = reply_info[-2].lower().split()
        # eliminate replies of replies.
        if 'quote' not in dividedWords_list:
            table.addDocument(str(iter), rate_flag, reply_length, lines_num,
                              dividedWords_list)
            # print 'lines_num:',lines_num,'reply_length:',reply_length

    #getFeatureSet(table)
    #sys.exit()

    # Get feature set of every sample.
    sample_list = table.documents
    infoPairs_list = []
    for sample_info in sample_list:
        rate_flag = sample_info[1]
        word_list = sample_info[-1].keys()
        reply_length = sample_info[2]
        lines_num = sample_info[3]
        text_features = getTextFeatures(word_list, reply_length, lines_num,
                                        feature_set)
        infoPairs_list.append((text_features, rate_flag))

    # Train a decision tree (or naive bayes) classifier and test it.
    size = int(len(infoPairs_list) * 0.1)
    train_set, test_set = infoPairs_list[size:], infoPairs_list[:size]
    print 'Training classifier ...'
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    #classifier = nltk.DecisionTreeClassifier.train(train_set)
    print 'Classifier has been trained!'

    #print classifier.pseudocode(depth=3)
    print nltk.classify.accuracy(classifier, test_set)

    # Save the classifier above.
    file = open('myNaiveBayesClassifier.pickle', 'wb')
    #file = open('myDecisionTreeClassifier.pickle', 'wb')
    pickle.dump(classifier, file)
    file.close()
    print 'my classifier has been saved.'
Example #15
def Three_Ka_thing(notes):
   A = 3
   U = []
   for i in range(len(notes) - (A-1)):
      x = tfidf.tfidf(CombSep(notes[i:i+A]))
      y = {k: v for k, v in sorted(x.items(), key=lambda item: item[1])}
      U.append(list(y.keys())[-5:])
   return U
Example #16
def main():

   table = tfidf.tfidf()
   table.addDocument("foo", ["alpha", "bravo", "charlie", "delta", "echo", "foxtrot", "golf", "hotel", "alpha"])
   table.addDocument("bar", ["alpha", "bravo", "charlie", "india", "juliet", "kilo"])
   table.addDocument("baz", ["kilo", "lima", "mike", "november"])

   print table.similarities (["alpha", "bravo", "charlie"]) # => [['foo', 0.6875], ['bar', 0.75], ['baz', 0.0]]
Example #17
 def getDocumentSimilarity(self, queryTermsMap, trainDoc):
     tfIdfHelper = tfidf(self.wordDF, self.numDocs, self.meanDocLen)
     similarity = 0.0
     for word in trainDoc.wordsMap:
         if queryTermsMap.has_key(word):
             similarity += tfIdfHelper.getTfIdf(queryTermsMap[word], word,
                                                trainDoc)
     return similarity
Example #18
def get_results(query):
    tf = tfidf()
    processed_query = tf.pre_processing(query)
    relevant_docs = tf.get_relevant_docs(processed_query)
    query_vector, idf_vector, tf_vector = tf.build_query_vector(processed_query)
    sorted_score_list, tf_new, idf_new, tf_idf_new = tf.similarity(relevant_docs, query_vector, idf_vector, tf_vector)
    search_result = tf.get_movie_info(sorted_score_list, tf_new, idf_new, tf_idf_new)
    return search_result, processed_query
Example #19
def main():
  #cleanData()
  #sys.exit()
  
  # Initialize feature set.
  file = open('feature_set.txt','r')
  #file = open('feature_set_2000.txt','r')
  feature_set = file.read().split('\n')
  file.close()
  
  # Initialize the table that contains all information of replies.
  #file = open('jieba_data2.txt','r')
  file = open('jieba_data_full.txt','r')
  lines = file.readlines()
  file.close()
  table = tfidf.tfidf()
  for iter in range(len(lines)):
    reply_info = lines[iter].split('(#SEP#)')
    rate_flag = reply_info[0]
    reply_length = reply_info[1]
    lines_num = reply_info[2]
    dividedWords_list = reply_info[-2].lower().split()
    # eliminate replies of replies.
    if 'quote' not in dividedWords_list:
      table.addDocument(str(iter), rate_flag, reply_length, lines_num, dividedWords_list)
      # print 'lines_num:',lines_num,'reply_length:',reply_length

  #getFeatureSet(table)
  #sys.exit()
      
  # Get feature set of every sample.
  sample_list = table.documents
  infoPairs_list = []
  for sample_info in sample_list:
    rate_flag = sample_info[1]
    word_list = sample_info[-1].keys()
    reply_length = sample_info[2]
    lines_num = sample_info[3]
    text_features = getTextFeatures(word_list,reply_length,lines_num,feature_set)
    infoPairs_list.append( (text_features, rate_flag) )
  
  # Train a decision tree (or naive bayes) classifier and test it.
  size = int(len(infoPairs_list) * 0.1)
  train_set, test_set = infoPairs_list[size:], infoPairs_list[:size]
  print 'Training classifier ...'
  classifier = nltk.NaiveBayesClassifier.train(train_set)
  #classifier = nltk.DecisionTreeClassifier.train(train_set)
  print 'Classifier has been trained!'
  
  #print classifier.pseudocode(depth=3)
  print nltk.classify.accuracy(classifier, test_set)
  
  # Save the classifier above.
  file = open('myNaiveBayesClassifier.pickle', 'wb')
  #file = open('myDecisionTreeClassifier.pickle', 'wb')
  pickle.dump(classifier, file)
  file.close()
  print 'my classifier has been saved.'
Example #20
def get_tfidf_doc(docTitle):
    sortedTitles = tfidf.tfidf(docTitle, nTitles=20)

    documents = list()
    for title in sortedTitles:
        wikisearch = wikipedia.WikipediaPage(title[0])
        wikicontent = wikisearch.links
        documents.append(wikicontent)
    return sortedTitles, documents
Example #21
def best_match_asset():

    try:
        query = request.json['query']

        return tfidf(query)

    except:
        return 'Query not submitted'
Example #22
def main():
    # k value needs to be changed according to the user
    kmean = kmeans(30)
    start_time = time.time()
    kmean.read_doc()
    kmean.documents = kmean.documents[1:]
    tfidf_v = tfidf(kmean.documents)
    kmean.cluster(tfidf_v.tfidf_docs)
    print("Total time taken:",time.time() - start_time)
Example #23
 def get_feature_string_by_document(self, _set, document):
     """Returns a string with the right format for features of the specified document. """
     label = self.sets[_set][document]['label']
     line = "{} ".format(label)
     for word in self.sets[_set][document]['words']:
         line += "{}:{} ".format(self.dictionary[word]['id'],
                                 tfidf.tfidf(word, document, self))
     line += "\n"
     return line
Example #24
def loadTfidf(locations):
    table = tfidf.tfidf()
    for l in locations:
        print l.split(" ")
        table.addDocument(l, l.split(' '))
    print '\nPREDICT:'
    for f in table.similarities(['Fillmore']):
        if f[1] > 0:
            print f
Example #25
def process_files(in_dir):
        
	words_per_file = dict()
        
	'''for root, dirnames, filenames in os.walk(DIR_NAME):
                for filename in fnmatch.filter(filenames, MATCH):
                        matches.append(os.path.join(root, filename))
	'''
	global WORDS, TRAIN

        WORDS = in_dir+'words'
        TEST = in_dir+'test.dat'

        CORRECT_LIST = []
        matches = []
        #CORRECT_LIST = (open(in_dir+'train_corr','r')).readlines()
        for line in (open(in_dir+'test_corr','r')).readlines():
                if line.replace('\n','') != '':
                        CORRECT_LIST.append(line.replace('\n',''))
                        matches.append(line.replace('\n',''))
        #print CORRECT_LIST
        for line in open(in_dir+'test_wrong','r').readlines():
                if line.replace('\n','') != '':
                        matches.append(line.replace('\n',''))

	words = get_all_words_file()
	length = str(len(matches))
	count = 0
	
	# first iteration through matches to get all words
	for match in matches:
		count +=1
		print str(count) + " of " + length + " test example preprocessing done"	
		words_per_file[match] = get_all_words(match)
	print "preliminary processing done"
	
	train_f = open(TEST, 'w')
	# second iteration through matches to get all word counts
	count = 0
	calc_freq = tfidf(words, words_per_file)
	print "Done calculating idfs"	
	#print calc_freq.getIDF('tweet')
	for match in matches:
		count +=1
		print str(count) + " of " + length + " testing examples done"

		if match in CORRECT_LIST:
			train_f.write('0')
		else:
			train_f.write('0')
		
		train_f.write(str(get_word_counts(words_per_file[match], words, calc_freq)))
		train_f.write('\n')
	
	train_f.close()	
	print "Results printed to file: " + str(TEST)
Example #26
def get_related_doc():
  tfidf.initFromFile("tfidf.txt")
  sortedTitles = tfidf.tfidf("United States", nTitles=4)

  documents = list()
  for title in sortedTitles:
    wikisearch = wikipedia.page(title[0]) 
    wikicontent = wikisearch.content
    wikicontent = re.sub(r'[=]+[^=]*[=]+', '', wikicontent)
    documents.append(sent_tokenize(wikicontent))
  return sortedTitles, documents
Example #27
File: lib.py Project: wagin/ppeople
def evaluate_keywords(job_posts, evaluated_idf, selection=0.1):
    for j in job_posts:
        tfidfs = {}
        for token in j['tokens']:
            if token in tfidfs:
                continue
            tfidfs[token] = tfidf(token, j['tokens'], evaluated_idf)
        sorted_tfidfs = sorted(tfidfs.items(), key=lambda x: -x[1])
        top_count = int(round(len(sorted_tfidfs)*selection))
        keywords = [i[0] for i in sorted_tfidfs[:top_count]]
        j['keywords'] = keywords
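evaluate_keywords above relies on a precomputed evaluated_idf table and a tfidf(token, tokens, idf) helper. The following self-contained sketch shows one plausible shape for those two pieces, assuming tf = count / len(tokens) and idf = log(N / document frequency); the actual project code may differ.

import math

def evaluate_idf(token_lists):
    n = len(token_lists)
    vocab = set().union(*token_lists)
    return {t: math.log(n / sum(1 for toks in token_lists if t in toks))
            for t in vocab}

def tfidf(token, tokens, evaluated_idf):
    return tokens.count(token) / len(tokens) * evaluated_idf.get(token, 0.0)

posts = [['python', 'tfidf', 'keyword', 'extraction'],
         ['python', 'web', 'scraping']]
idf_table = evaluate_idf(posts)
print(tfidf('tfidf', posts[0], idf_table))   # > 0, unlike the shared word 'python'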
Example #28
def get_keyword(data: list[Weibo], stopwords=set()) -> list[list[str]]:
    # map() returns a lazy iterator; build a real list so idf() can consume it.
    comments_flat: list[list[str]] = [
        reduce(lambda x, y: x + y.words, w.comments, []) for w in data]
    idf = tfidf.idf(comments_flat)
    weibo_keywd = []
    for id, time, total, comments in data:
        all_text = reduce(lambda x, y: x+y.words, comments, [])
        weibo_keywd.append(
            tfidf.tfidf(all_text, idf, stopwords=stopwords))

    return weibo_keywd
Example #29
    def __init__(self, urls = []):
        #Intermediate data will be handled here: urls, extracted text, terms, clusters, etc.

        #list of urls and their labels, ranking scores
        #e.g: urls = [["nature.com", 1, 0.9], ["sport.com", 0, 0.01]
        #list of terms and their labels, ranking scores
        #e.g: terms = [["science", 1, 0.9], ["sport", 0, 0.02]]
        self.urls_set = set(urls)
        self.positive_urls_set = set()
        self.negative_urls_set = set()
        self.tfidf = tfidf.tfidf()
        self.memex_home = environ['MEMEX_HOME']
Example #30
def make_comparisons(url):
    arts = Article.objects.all()
    if len(arts) == 0:
        content = "Like pages to get started!"
        comparisons = None
    else:
        url_list = []
        for article in arts:
            url_list.append(article.url)
        # Check if we have already liked/disliked it
        if url in url_list:
            if Article.objects.get(url=url).response == 'L':
                content = 'ALREADY LIKED (100%)'
                comparisons = None
            elif Article.objects.get(url=url).response == 'D':
                content = 'ALREADY DISLIKED (100%)'
                comparisons = None
            else:
                raise AttributeError('Somehow we managed to neither like nor dislike this webpage.')
        # What to do if we haven't
        elif url not in url_list:
            bodies_like = ''
            bodies_dislike = ''
            # Grab likes and dislikes
            for article in arts:
                if article.response == 'L':
                    bodies_like += ' '
                    bodies_like += str((unidecode(article.body_text)))
                elif article.response == 'D':
                    bodies_dislike += ' '
                    bodies_dislike += str(unidecode(article.body_text))
                else:
                    raise ValueError('Unknown rating encountered. Contact admin to reset database.')
            bodies_like_words = bodies_like.split()
            bodies_dislike_words = bodies_dislike.split()
            table = tfidf.tfidf()
            table.addDocument('likes', bodies_like_words)
            table.addDocument('dislikes', bodies_dislike_words)
            compare_text = get_html_text(url)
            comparisons = table.similarities(compare_text)
            if comparisons[0][1] > comparisons[1][1]:
                content = 'LIKE ({0}% Certain)'.format(round(100. * comparisons[0][1], 2))
            elif comparisons[0][1] < comparisons[1][1]:
                content = 'DISLIKE ({0}% Certain)'.format(round(100. * comparisons[1][1], 2))
            elif comparisons[0][1] == comparisons[1][1]:
                content = 'NEUTRAL ({0}% Certain)'.format(round(100. * comparisons[0][1], 2))
            else:
                raise ValueError('You should never receive this error. If so please send the admin a message saying so...')
        # 'How did you get here?' Error
        else:
            raise LookupError("This URL both does not exists in the database and does not, not exist in the database."
                              "Might be time to fall into a solipsistic coma and hope Apocalypse Now isn't real.")
    return content, comparisons
Example #31
def runSearch(searchWords):
    with open("data/jokes.json") as jokeFile:
        jokeDict = {joke["title"]: joke for joke in json.load(jokeFile)}

        table = tfidf.tfidf()
        for joke in jokeDict.values():
            table.addDocument(joke["title"], extract_word_list(joke["content"]))

        for joke in top_joke_list(jokeDict, table, searchWords):
            print("*** " + joke["title"])
            print(joke["content"])
            print("=======================")
Example #32
def search(query, metric):
    query = preprocess(query)

    if metric == 'tfidf':
        result = tfidf(query)
    elif metric == 'bm25':
        result = bm25(query)
    elif metric == 'fasttext':
        result = fasttext(query)
    elif metric == 'elmo':
        result = elmo(query)
    else:
        raise ValueError('Unknown metric: ' + metric)

    return result
Example #33
def load_table():

    cr = csv.reader(open("lines.csv","rb"))
    table = tfidf.tfidf()

    for r in cr:                              
            id = r[0]
            words = (r[1] + " " + r[2]).lower()
            words = words.replace('-', ' ')
            words = words.replace('.', ' ')
            tokens = nltk.word_tokenize(words)
            table.addDocument(id, tokens)

    return table
Example #34
def tf_idf(directory, ext):
    table = tfidf.tfidf()
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(ext):
                filename = os.path.join(root, file)
                try:
                    print file
                    content = open(filename, 'rb').read()
                    opcodeList = content.split(",")
                    table.addDocument(file, opcodeList)
                except Exception as err:
                    print str(err)
    print table.similarities(["nop"])
Example #35
def compute_tfidf_matrix(corpus_dir):
    t = tfidf.tfidf()
    for path, subdirs, files in os.walk(corpus_dir):
        for name in files:
            f = os.path.join(path, name)
            with codecs.open(f, 'rb', 'utf-8') as i:
                tokens = []
                for line in i:
                    # Skip <doc> tags
                    if not regex.match(ur'</?doc', line):
                        l_tokens = regex.split(ur'[^\p{L}]+', line.lower())
                        tokens += [token for token in l_tokens
                                         if token and token not in STOPWORDS]
                t.addDocument(f, tokens)
    return t
Example #36
def main():

    table = tfidf.tfidf()
    table.addDocument("foo", [
        "alpha", "bravo", "charlie", "delta", "echo", "foxtrot", "golf",
        "hotel", "alpha"
    ])
    table.addDocument("bar",
                      ["alpha", "bravo", "charlie", "india", "juliet", "kilo"])
    table.addDocument("baz", ["kilo", "lima", "mike", "november"])

    print table.similarities(
        ["alpha", "bravo",
         "charlie"])  # => [['foo', 0.6875], ['bar', 0.75], ['baz', 0.0]]
Example #37
def compute_tfidf_matrix(corpus_dir):
    t = tfidf.tfidf()
    for path, subdirs, files in os.walk(corpus_dir):
        for name in files:
            f = os.path.join(path, name)
            with codecs.open(f, 'rb', 'utf-8') as i:
                tokens = []
                for line in i:
                    # Skip <doc> tags
                    if not regex.match(ur'</?doc', line):
                        l_tokens = regex.split(ur'[^\p{L}]+', line.lower())
                        tokens += [
                            token for token in l_tokens
                            if token and token not in STOPWORDS
                        ]
                t.addDocument(f, tokens)
    return t
Example #38
def tfidf_per_genre(plot_wc=False):
    data = pd.read_csv(config.dataset_dir + 'final_data.csv')
    genres_file = open(config.dataset_dir + 'unique_genres.txt', 'r')
    genre_list = [genre.strip('\n') for genre in genres_file.readlines()]
    directory = config.dataset_dir + 'output/sentiment_word_texts/'
    book_list = []

    index = tfidf.create_index(directory)

    for genre in genre_list:
        genre = genre.replace('/', ' ')

        score_dict = {}
        book_list = []

        books_of_genre = data.loc[data['genre'] == genre]

        for book in books_of_genre['filename']:
            book_list.append(book)

        try:
            tf_matrix, genre_tokens = tfidf.create_tf_matrix(
                directory, book_list, genre)

            for term in genre_tokens:
                score = tfidf.tfidf(term, genre, directory, index, tf_matrix)
                score_dict[term] = score

            scores_file = open(
                config.dataset_dir + 'output/top200_per_genre/' + genre +
                '.txt', 'w')

            for w in sorted(score_dict, key=score_dict.get, reverse=True):
                scores_file.write('%s\n' % w)

            scores_file.close()

            print('success')

            if plot_wc:
                font_path = config.dataset_dir + 'Open_Sans_Condensed/OpenSansCondensed-Light.ttf'
                create_wordcloud(score_dict, genre)

        except ZeroDivisionError:
            continue
        except ValueError:
            continue
Example #39
def cal_similarity(mail):
	tmp_tfidf = tfidf()
	similarity = []
	title_sim = []
	for i in mail.text:
		tmp_tfidf.addDocument(i.index,i.remove_stop_ver)
	for i in mail.text:
		tmp_vector = []
		tmp = tmp_tfidf.similarities(i.remove_stop_ver)
		for j in tmp:
			tmp_vector.append(j[1])		
		similarity.append(tmp_vector)
	#print(similarity)
	tmp_ti = tmp_tfidf.similarities(mail.remove_stop_ver_title)
	for i in tmp_ti:
		title_sim.append(i[1])
	return numpy.asarray(similarity),numpy.asarray(title_sim)
Example #40
def process_files():
        words_per_file = dict()
	matches = []
        for root, dirnames, filenames in os.walk(DIR_NAME):
                for filename in fnmatch.filter(filenames, MATCH):
                        matches.append(os.path.join(root, filename))
	words = set()
	length = str(len(matches))
	count = 0
	user_counter = 0 
	# first iteration through matches to get all words
	for match in matches:
		count +=1
		print str(count) + " of " + length + " training example preprocessing done"
		#if 'tweet' in get_all_words(match) :
		#	user_counter += 1
		#	print user_counter
		words_per_file[match] = get_all_words(match)
		words = words.union(words_per_file[match])
	
	print "preliminary processing done"
	train_f = open(TRAIN, 'w')

	# second iteration through matches to get all word counts
	pattern = MATCH.replace('*','')
	count = 0
	calc_freq = tfidf(words, words_per_file)
	print "Done calculating idfs"	
	#print calc_freq.getIDF('tweet')
	for match in matches:
		count +=1
		print str(count) + " of " + length + " training examples done"
	
		if CORRECT_PATTERN in match:
		#if match in CORRECT:
		#if match.replace(DIR_NAME+'/'+pattern,'') in CORRECT:
			#print calc_freq.term_freq('tweet',words_per_file[match])
			train_f.write('+1')
			#print len(words_per_file[match])
		else:
			train_f.write('-1')
		
		train_f.write(str(get_word_counts(words_per_file[match], words, calc_freq)))
		train_f.write('\n')
	train_f.close()	
	print "Results printed to file: " + str(TRAIN)
Example #41
    def makeVector(self, wordString, weighting):
        """ @pre: unique(vectorIndex) """

        #Initialise vector with 0's
        vector = [0] * len(self.vectorKeywordIndex)
        wordList = self.parser.tokenise(wordString)
        wordList = self.parser.removeStopWords(wordList)        
        documentString = " ".join(wordList)
        blob = tb(documentString)
        
        ### tf weighting
        for word in wordList:
            if weighting == 'tf':
                vector[self.vectorKeywordIndex[word]] += 1 / len(wordList)  #Use simple Term Count Model
                # vector[self.vectorKeywordIndex[word]] = tfidf.tf(word, blob)
            elif weighting == 'tfidf':
                vector[self.vectorKeywordIndex[word]] = tfidf.tfidf(word, blob, self.blobList)
        return vector
Example #42
def generate_weights(documents, lexicon):

    weights = dict()
    m = tfidf()
    print('Adding documents for TF-IDF...')
    for i, document in enumerate(documents):
        m.addDocument(i,
                      document['words']['title'] + document['words']['body'])
        weights[i] = dict()

    print('Generating weight scores for words; This WILL take time...')
    for word in lexicon['title'] & lexicon['body']:
        m.get_similarities(word, weights, 'smooth', 1.25)
    for word in lexicon['title'] - lexicon['body']:
        m.get_similarities(word, weights, 'smooth', 1.1)
    for word in lexicon['body'] - lexicon['title']:
        m.get_similarities(word, weights, 'smooth')
    return weights
Example #43
    def __call__(self, docname, values):
        terms = []
        fd = nltk.probability.FreqDist()

        for (term, (inTitle, position), n, N, d) in values:
            #relativePos = float(position)/m
            term_str = ' '.join(term)

            if inTitle:
                terms.append(term_str)
            else:
                score = tfidf.tfidf(n, N, d, self.doccount)
                #score *= relative_pos
                fd.inc(term_str, score)

        # top upper_fraction of terms
        n = int(self.upper_fraction * len(fd))
        terms += fd.keys()[:n]
        yield docname, separator.join(terms)
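The four-argument tfidf.tfidf(n, N, d, self.doccount) used above is assumed here to take n = occurrences of the term in the document, N = document length, d = number of documents containing the term, and doccount = corpus size; that reading is inferred from the tuple unpacked in the loop, not confirmed by the source. A minimal sketch under that assumption:

import math

def tfidf(n, N, d, doccount):
    # (term frequency) * (inverse document frequency)
    return (float(n) / N) * math.log(float(doccount) / d)

print(tfidf(3, 120, 4, 1000))   # a term seen 3 times in a 120-word doc, in 4 of 1000 docs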
Example #44
def jump_to_relevant_paragraph(question, html):
    paragraphs = []
    table = tfidf.tfidf()
    input_soup = bs4.BeautifulSoup(html)
    
    for index, paragraph in enumerate(input_soup.find_all("p")):
        paragraphs.append(paragraph.text)
        words = extract_words(paragraph.text)
        table.addDocument(index, words)
    
    question_words = extract_words(question)
    most_relevant_paragraph_index = max(table.similarities(question_words),
            key=lambda (paragraph_index, score):score)[0]

    output = []
    for paragraph in paragraphs[most_relevant_paragraph_index:]:
        output.append("".join(["<p>", paragraph, "</p>"]))

    return "".join(output)
Example #45
 def compareDocToCenter(self, document, wordDFMap, numDocs, meanDocLen):
     if document.centerScore > 0.0:
         return
     if self.numCenterWords > self.queryLength:
         self.numCenterWords = self.queryLength
         tfIdf = tfidf(wordDFMap, numDocs, meanDocLen)
         document.centerScore = 0.0
         i = 0
         wordsUsed = 0
         while wordsUsed < self.numCenterWords:
             word = self.sortedTF[i][0]
             count = self.sortedTF[i][1]
             i += 1
             if tfIdf.isStopWord(word):
                 continue
             wordsUsed += 1
             if document.wordsMap.has_key(word):
                 document.centerScore += tfIdf.getTfIdf(count, word, document)
         self.combinedCenterSim += document.centerScore
Example #46
def bert_answers(question, no_answers):
    # Insert question - tf-idf operation performed first and top n bert answers will be returned
    pars = tfidf(question, part_noctable)

    # Select TF-IDF results above threshold.
    top_results = pars.loc[pars['Match Percentage'] >= 10]
    top_results = top_results['Paragraph']

    # Crop results based on those specified in no_answers, unless top_results is shorter than the number specified
    if len(top_results) > no_answers:
        crop_results = top_results.head(no_answers)
    else:
        crop_results = top_results

    #print(crop_results)
    for index, row in crop_results.iteritems():
        print("Bert Answer {}".format(index + 1))
        print(row)
        print(bert(question, row))
Example #47
 def store_tfidf(self, title, origin, url, text, amount, lang, publish_tsd):
     self.origin = origin
     if len(text) > 0:
         sentences = tfidf(text, amount, lang)
         if len(sentences) == amount:
             article = models.Article(title=title,
                                      origin=origin,
                                      insert_tsd=timezone.now(),
                                      publish_tsd=publish_tsd,
                                      original_url=url,
                                      bp1=sentences[0],
                                      bp2=sentences[1],
                                      bp3=sentences[2],
                                      bp4=sentences[3],
                                      bp5=sentences[4])
             article.save()
     self.processed_articles += 1
     self.logger.info(
         f'{self.processed_articles:03d} / {self.max_articles:03d}')
Example #48
 def __call__(self, docname, values):
     terms = []
     fd = nltk.probability.FreqDist()
     
     for (term, (inTitle, position), n, N, d) in values:
         #relativePos = float(position)/m
         term_str = ' '.join(term)
     
         if inTitle:
             terms.append(term_str)
         else:
             score = tfidf.tfidf(n, N, d, self.doccount)
             #score *= relative_pos
             fd.inc(term_str, score)
     
     # top upper_fraction of terms
     n = int(self.upper_fraction * len(fd))
     terms += fd.keys()[:n]
     yield docname, separator.join(terms)
Example #49
def build_tfidf(foreignBrand):
    datas = []
    rs = []
    merge_to_series = False
    (brand, vendor, series) = foreignBrand.split(u"#")
    #print brand.encode("utf-8") + ":" + maker.encode("utf-8")

    res = external_spec_col.find({
        'site': sys.argv[1],
        'brand.name': brand,
        "vendor.name": vendor,
        "series.name": series
    })
    for r in res:
        if r['model']['key'] == '0':
            merge_to_series = True
        data = [
            r['model']['id'], r['brand']['name'], r['vendor']['name'],
            r['series']['name'], r['model']['name'],
            r['model'].get('year', '') or ''
        ]
        datas.append(data)
        rs.append(r)
    if merge_to_series:
        data = datas[0]
        k = "#".join(data)
        return k

    #engine_dict = {}
    #for i in xrange(len(rs)):
    #    e = getExternalEngine(rs[i])
    #    if len(datas) == 0:
    #        #print foreignBrand
    #        return None

    tf = tfidf.tfidf()
    for data in datas:
        k = "#".join(data)
        n = "#".join(data[1:])
        c = word_bag(n)
        tf.addDocument(k, c)
    return tf
Example #50
    def submit_selected_urls(self, positive, negative):
    #Perform ranking and diversifying on all urls with regard to the positive urls
    #
    #Args:
    #   labeled_urls: a list of pair <url, label>. Label 1 means positive and 0 means negative.
    #Returns:
    #   urls: list of urls with ranking scores

        # Test new positive and negative examples with existing classifier
        # If accuracy above threshold classify pages
        # Ranking 
        # Diversification

        documents = {}
        other = []
        
        all_docs = get_bag_of_words(list(self.urls_set))

        for url in positive:
            if url in all_docs:
                self.positive_urls_set.add(url)
                self.negative_urls_set.discard(url)

        for url in negative:
            if url in all_docs:
                self.negative_urls_set.add(url)
                self.positive_urls_set.discard(url)

        for url in all_docs.keys():
            content = all_docs[url]
            if (len(self.negative_urls_set) == 0) or (url not in self.negative_urls_set):
                documents[url] = content
                if url not in self.positive_urls_set:
                    other.append(url)

        self.tfidf = tfidf.tfidf(documents)

        chdir(self.memex_home + '/seed_crawler/ranking')
        ranker = rank.rank()
        
        [ranked_urls,scores] = ranker.results(self.tfidf,self.positive_urls_set, other)
        return [ranked_urls, scores] # classified, ranked, diversified 
Example #51
 def compareDocToCenter(self, document, wordDFMap, numDocs, meanDocLen):
     if document.centerScore > 0.0:
         return
     if self.numCenterWords > self.queryLength:
         self.numCenterWords = self.queryLength
         tfIdf = tfidf(wordDFMap, numDocs, meanDocLen)
         document.centerScore = 0.0
         i = 0
         wordsUsed = 0
         while wordsUsed < self.numCenterWords:
             word = self.sortedTF[i][0]
             count = self.sortedTF[i][1]
             i += 1
             if tfIdf.isStopWord(word):
                 continue
             wordsUsed += 1
             if document.wordsMap.has_key(word):
                 document.centerScore += tfIdf.getTfIdf(
                     count, word, document)
         self.combinedCenterSim += document.centerScore
Example #52
 def set_up_table(self, k):
     table = tfidf.tfidf()
     size = len(self.clusters['tier'+str(k)])
     i = 0
     keys = self.clusters['tier'+str(k)].keys()
     self.new_pics = {}
     while i < size:
         pics =  self.clusters['tier'+str(k)][keys[i]]
         id = ''
         tokens = []
         j = 0
         while j < len(pics):
             id = id  + pics[j]
             tokens = tokens + self.pics[pics[j]]['tokens']
             j = j + 1
         self.new_pics[id] = tokens
         i = i +1
     for id in self.new_pics.keys(): 
         table.addDocument(id,self.new_pics[id])
     return table
Example #53
def generate_weights(documents, lexicon):
    """ function: generate_weights
        --------------------------
        perform tf-idf to generate importance scores for words in documents

        :param documents: list of documents to use in calculations
        :param lexicon: dict of 'title' and 'body' word sets
        :returns: dictionary of dictionaries: {"id_" : {"word" : score,...}}
    """
    # weight = { 'document' : { 'word' : score,... },... }
    weights = dict()
    m = tfidf()
    print('Adding documents for TF-IDF...')
    for i, document in enumerate(documents):
        m.addDocument(i, document['words']['title']+document['words']['body'])
        weights[i] = dict()
    # generate dictionary of { "word", "score" } pairs for each document
    print('Generating weight scores for words; This WILL take time...')
    for word in lexicon['title'] | lexicon['body']:
        # UNCOMMENT FOR SANITY
        # print('Generating weights for word:', word)
        m.get_similarities(word, weights)
    return weights
Example #54
    with open(f, "r") as handle:
        data = json.loads(handle.read())
        for j in data:
            cmd = j["name"]
            desc = j["description"]
            flags = { (cmd, flag["name"]): flag["description"] for flag in j["optionDesc"] }
            cmd_docs[cmd] = tokenize(desc)
            for k, desc in flags.items():
                descs[k] = desc
                toks = tokenize(desc)
                cmd_docs[cmd] += toks
                flag_docs[k] = toks

print("Indexing...")
import tfidf
engine = tfidf.tfidf()
flag_engines = {}
for cmd, toks in cmd_docs.items():
    engine.addDocument(cmd, toks)
    flag_engines[cmd] = tfidf.tfidf()
    for (cmdd, flag), flag_toks in flag_docs.items():
        if cmdd == cmd:
            flag_engines[cmd].addDocument(flag, flag_toks)

print("Running...")
while True:
    try:
        inp = raw_input("> ")
    except EOFError:
        break
    toks = tokenize(inp)
Example #55
0
def process_files(in_dir):
        
	words_per_file = dict()
        t_words_per_file = dict()
	'''for root, dirnames, filenames in os.walk(DIR_NAME):
                for filename in fnmatch.filter(filenames, MATCH):
                        matches.append(os.path.join(root, filename))
	'''
	global WORDS, TRAIN

        WORDS = in_dir+'words'
        TEST = in_dir+'test.dat'
	
	## SHOULD TFIDF be calc adding in test_data?
	TRAIN_SET = []
	for line in (open(in_dir+'train_corr','r')).readlines():
        	if line.replace('\n','') != '':
        	        TRAIN_SET.append(line.replace('\n',''))
	for line in (open(in_dir+'train_wrong','r')).readlines():
         	if line.replace('\n','') != '':
        	        TRAIN_SET.append(line.replace('\n',''))

	for t_match in TRAIN_SET:
                t_words_per_file[t_match] = get_all_words(t_match)
	

        CORRECT_LIST = []
        matches = []
        for line in (open(in_dir+'test_corr','r')).readlines():
                if line.replace('\n','') != '':
                        CORRECT_LIST.append(line.replace('\n',''))
                        matches.append(line.replace('\n',''))
        for line in open(in_dir+'test_wrong','r').readlines():
                if line.replace('\n','') != '':
                        matches.append(line.replace('\n',''))

	words = get_all_words_file()
	length = str(len(matches))
	count = 0
	
	# first iteration through matches to get all words
	for match in matches:
		count +=1
		print str(count) + " of " + length + " test example preprocessing done"	
		words_per_file[match] = get_all_words(match)
	print "preliminary processing done"	
	train_f = open(TEST, 'w')
	# second iteration through matches to get all word counts
	count = 0
	# TODO: should this be t_words_per_file, words_per_file or the combination???
	calc_freq = tfidf(words, t_words_per_file)
	print "Done calculating idfs"	
	#print "PARAM " + str(calc_freq.getIDF('@param'))
	for match in matches:
		count +=1
		print str(count) + " of " + length + " testing examples done"

		if match in CORRECT_LIST:
			train_f.write('+1')
		else:
			train_f.write('-1')
		
		train_f.write(str(get_word_counts(words_per_file[match], words, calc_freq)))
		train_f.write('\n')
	
	train_f.close()	
	print "Results printed to file: " + str(TEST)
Example #56
def pre_proc_code(run_pre_proc):
	# grab all corpus files
	# pre-write the tfidfs to file so can be read form file

	if not run_pre_proc:

		matches = []
		words_per_file = dict()
		for root, dirnames, filenames in os.walk(CORPUS_1):
         	       for filename in fnmatch.filter(filenames, "*.java_*"):
                	        matches.append(os.path.join(root, filename))
		for root, dirnames, filenames in os.walk(CORPUS_2):
                       for filename in fnmatch.filter(filenames, "*.java_*"):
                                matches.append(os.path.join(root, filename))

		words = set()
		for match in matches:
			print match
			words_per_file[match] = get_all_words(match)
			words = words.union(words_per_file[match])			
		'''	
		print "Writing words per file to file tmp/words_per_file.csv'"
                f_words_pf = open(TMP_DIR+'words_per_file_tmp.csv', 'w')
                for w in words_per_file:
                        f_words_pf.write(w + ','  + str(words_per_file[w]) + '\n')
                f_words_pf.close()
                print "Writing all words to file tmp/words_tmp.csv'"
                f_words = open(TMP_DIR+'words_tmp.csv', 'w')
                for word in words:
                        f_words.write(word + '\n')                    
                f_words.close()
		'''
		print "calculuating idfs"
		calc_freq = tfidf(words, words_per_file)
		#print "actually calculating idfs"
		idfs = calc_freq.getAllIdfs()	
		
		print "Writing idf values to file tmp/idfs_tmp.csv'"
		f_idfs = open(TMP_DIR+'idfs_tmp.csv', 'w')
		for key in idfs:
			if key != '':
				f_idfs.write(key + ',' + str(idfs[key]) + '\n')			
		f_idfs.close()
		
		#caluclate feature vectors
		feature_vectors_dict = dict()
		f_feature_vecs = open(TMP_DIR+'/feature_vecs_tmp.csv','w')
		for match in matches:
			wrd_cnts = str(get_word_counts(words_per_file[match], words, calc_freq))
                	f_feature_vecs.write(match+','+ wrd_cnts +'\n')
			feature_vectors_dict[match] = wrd_cnts   
		
	else:
		print "Reading values from files in tmp/ directory"
		words = set()
		'''
		f_words = open(TMP_DIR+'words_tmp.csv', 'r')
		for word in f_words.readlines():
                	words.add(word.replace('\n',''))        
		f_words.close()
		'''
		idfs = dict()
                f_idfs = open(TMP_DIR+'idfs_tmp.csv', 'r')
                for line in f_idfs.readlines():
			comma = line.split(',')
			idfs[comma[0]] = float((comma[1]).replace('\n',''))
                        #print comma[0] + ',' + comma[1].replace('\n','')
			#f_idfs.write(key + ',' + str(idfs[key]) + '\n')
                f_idfs.close()
	              
                feature_vectors_dict = dict()
                with open(TMP_DIR+'feature_vecs_tmp.csv', 'r') as csvfile:
                        reader = csv.reader(csvfile)
                        for row in reader:
                                feature_vectors_dict[row[0]] = (row[1]).replace('\n','')
                        	#print (row[0] + ','  + row[1])
                csvfile.close()
        

		'''
		words_per_file = dict()
		with open(TMP_DIR+'words_per_file_tmp.csv', 'r') as csvfile:
                	reader = csv.reader(csvfile)
			for row in spamreader:
				words_per_file[row[0]] = eval(row[1])
                        #f_words_pf.write(w + ','  + str(words_per_file[w]) + '\n')
                csvfile.close()
		'''
		# read featute vectors from file
	return feature_vectors_dict, idfs
Example #57
from bs4 import BeautifulSoup
soup = BeautifulSoup(open('cwec_v2.8.xml'), 'xml')

from nltk.stem import WordNetLemmatizer
st = WordNetLemmatizer()
stemmer = SnowballStemmer("english")
print('*'*50)
i = 0
z = 0
stopwords = nltk.corpus.stopwords.words('english')

g = open('Data2/words.txt', 'w+')
g.close()

for x in soup.Weakness_Catalog.Weaknesses.find_all('Weakness'):
    table = tfidf.tfidf()
    name = x.attrs['Name']
    #Get description summary and format text.
    s = x.Description.Description_Summary.get_text()
    s = s.lower()
    s = "".join(c for c in s if c not in string.punctuation)
    a = s.split()
    a = [w for w in a if w.lower() not in stopwords]
    a = [stemmer.stem(w) for w in a]
    c = Counter(a)
    #See if there is an extended description before grabbing it.
    if hasattr(x.Description, 'Extended_Description'):
        if x.Description.Extended_Description is None:
            z += 1
            p = ""
        else:
Example #58
def rocchio():
    all_documents, term_dictionary, doc_lengths = textmanip.get_documents()
    all_queries = textmanip.get_queries()
    avg_doc_len = numpy.average(doc_lengths)

    #calculate the tfidf weigthed sum for each document query pair
    query_doc_similar_list = tfidf.tfidf(term_dictionary, all_queries, all_documents, doc_lengths)

    #1. Get top relevant docs
    most_rel_docs = list()
    for q, query in enumerate(all_queries):					
        if len(all_documents)<10:
            rel_docs = 2 if len(all_documents)>=2 else len(all_documents)
        most_rel_docs.append(sorted(query_doc_similar_list[q], reverse=True)[:rel_docs])
        
    #2. Calculate all the TFIDF of the words in the relevant all_documents
    #	Add the values together: for every word, add up the weight given to it by each document
    #	Get the 10 most important words
    #   Add those words to the query
    #   Run TFIDF again with new query
    query_weight_list = list()				
    for docList in most_rel_docs:			
        doc_weights = list()				
        for doc in docList:				
            currentDocDict = all_documents[doc]		
            word_weights = dict()				
            for word in term_dictionary:				
                word_weights[word] = textmanip.tfidfWordWeighting(word, doc, term_dictionary, all_documents, doc_lengths, avg_doc_len) 
            doc_weights.append(word_weights)
        query_weight_list.append(doc_weights)		

    #average out the values obtained for the weights accross all docs (per query) -> you get one weight list per query
    avg_query_w_list = list()			
    for i in range(len(all_queries)):				
        averageWordWeights = dict()				
        for word in term_dictionary:		
            addedWeight = 0
            for j in range(len(most_rel_docs[i])):		
                weightDict = query_weight_list[i][j]
                addedWeight += weightDict[word]		
            averageWordWeights[word] = addedWeight/(len(most_rel_docs[i])*1.00) 
        avg_query_w_list.append(averageWordWeights)		

    rocchio_q = copy.deepcopy(all_queries)
    for i in range(len(rocchio_q)):
        #get the max weighted words in the avg_query_w_list[i] dictionary
        sortedListOfWords = sorted(avg_query_w_list[i].items(), key=operator.itemgetter(1), reverse=True)
        
        wordsInTheRelevantDocs = list()
        for j in range(len(sortedListOfWords)):
            wordsInTheRelevantDocs.append(sortedListOfWords[j][0])
            for word in rocchio_q[i]:
                if word not in wordsInTheRelevantDocs:
                    rocchio_q[i][word] = (A*(rocchio_q[i][word]))
                
        for j in range(len(sortedListOfWords)):
            wordToAdd = sortedListOfWords[j][0]
            weightToAdd = sortedListOfWords[j][1]

            if wordToAdd in rocchio_q[i]:
                rocchio_q[i][wordToAdd] = (A*(rocchio_q[i][wordToAdd]))+(B*(weightToAdd))
            else:
                rocchio_q[i][wordToAdd] = (B*(weightToAdd))

    queryDocSimilaryList = tfidf.tfidf(term_dictionary, rocchio_q, all_documents, doc_lengths)
    textmanip.outputResults(queryDocSimilaryList, 'best.top')
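For context, the query update implemented above is a positive-feedback-only variant of the standard Rocchio formula: with A and B playing the roles of alpha and beta, and the negative-document term dropped, it corresponds roughly to

\vec{q}_{\text{new}} = \alpha \, \vec{q} \;+\; \frac{\beta}{|D_r|} \sum_{\vec{d} \in D_r} \vec{d}

where D_r is the set of top-ranked documents treated as relevant.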