def indexDocument(inString, schemeDocs, schemeQuery, invIndex):
    # check scheme
    if schemeDocs == 'tfidf' or schemeDocs == 'tfc':
        # Preprocess input string into list of tokens
        tokenList = preprocess.stemWords(preprocess.removeStopwords(
            preprocess.tokenizeText(preprocess.removeSGML(inString))))
        # get document number and increment doc-count
        docNum = invIndex['doc-count']
        invIndex['doc-count'] += 1
        # build temporary dictionary of term frequencies for this document
        # wordDict { 'word': tf }
        wordDict = {}
        for word in tokenList:
            if word in wordDict:
                wordDict[word] += 1.0
            else:
                wordDict[word] = 1.0
        # add entries to invIndex for each word
        # increments document frequency where necessary
        for word, tf in wordDict.items():
            if word in invIndex:
                invIndex[word]['docList'].append([docNum, tf])
                invIndex[word]['df'] += 1.0
            else:
                invIndex[word] = {'df': 1.0, 'docList': [[docNum, tf]]}
        return invIndex
    else:
        sys.exit("Document weighting scheme '" + schemeDocs +
                 "' is not acceptable input. Try 'tfidf' or 'tfc'.")

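# Illustrative only: a hypothetical snapshot of the invIndex structure the function
# above maintains, assuming the index was seeded with {'doc-count': 1} and two
# documents have been indexed. Per term, 'df' is the document frequency and
# 'docList' holds [docNum, term frequency] pairs; the terms shown are made up.
exampleIndex = {
    'doc-count': 3,
    'retriev': {'df': 2.0, 'docList': [[1, 2.0], [2, 1.0]]},
    'weight':  {'df': 1.0, 'docList': [[2, 3.0]]},
}
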
def processText(text, stopwords=False, stem=False):
    tokens = removeSGML(text)
    tokens = tokenizeText(tokens)
    if stopwords:
        tokens = removeStopwords(tokens)
    if stem:
        tokens = stemWords(tokens)
    return tokens

def retrieveDocuments(query, invertedIndex, weightingDoc, weightingQuery):
    # preprocess the query
    que = preprocess.removeSGML(query)
    que = preprocess.tokenizeText(que)
    que = preprocess.removeStopwords(que)
    que = preprocess.stemWords(que)
    # drop the first token (presumably the query ID)
    del que[0]
    # collect the set of documents that contain at least one token from the query
    tfque = {}
    docSet = set()
    for token in que:
        if token not in invertedIndex:
            continue
        if token not in tfque:
            tfque[token] = 0
        tfque[token] += 1
        for pair in invertedIndex[token]:
            docSet.add(pair[0])
    queList = list(tfque.keys())
    relDoc = {}
    # tfidf.tfidf
    if weightingDoc == "tfidf" and weightingQuery == "tfidf":
        docWeight, queWeight = cal_tfidf(queList, tfque, docSet, invertedIndex)
        for docID in docWeight:
            relDoc[docID] = 0
            for idx, tf in enumerate(docWeight[docID]):
                relDoc[docID] += tf * queWeight[idx]
    # tfidf.bpx
    elif weightingDoc == "tfidf" and weightingQuery == "bpx":
        docWeight, _ = cal_tfidf(queList, tfque, docSet, invertedIndex)
        queWeight = cal_bpx(queList, tfque, invertedIndex)
        for docID in docWeight:
            relDoc[docID] = 0
            for idx, tf in enumerate(docWeight[docID]):
                relDoc[docID] += tf * queWeight[idx]
    # nxx.tfidf
    elif weightingDoc == "nxx" and weightingQuery == "tfidf":
        _, queWeight = cal_tfidf(queList, tfque, docSet, invertedIndex)
        docWeight = cal_nxx(queList, docSet, invertedIndex)
        for docID in docWeight:
            relDoc[docID] = 0
            for idx, tf in enumerate(docWeight[docID]):
                relDoc[docID] += tf * queWeight[idx]
    # nxx.bpx
    elif weightingDoc == "nxx" and weightingQuery == "bpx":
        docWeight = cal_nxx(queList, docSet, invertedIndex)
        queWeight = cal_bpx(queList, tfque, invertedIndex)
        for docID in docWeight:
            relDoc[docID] = 0
            for idx, tf in enumerate(docWeight[docID]):
                relDoc[docID] += tf * queWeight[idx]
    else:
        print("Weighting scheme for doc is [tfidf, nxx], for query is [tfidf, bpx]")
        quit()
    return relDoc

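# cal_tfidf, cal_nxx and cal_bpx are project helpers that are not shown here.
# As a rough, self-contained illustration of the tf-idf idea they are named after
# (not the project's actual implementation), a term weight can be computed as:
import math

def tfidf_weight(tf, df, num_docs):
    # term frequency times inverse document frequency
    return tf * math.log(num_docs / float(df))

print(tfidf_weight(tf=3, df=2, num_docs=10))  # term seen 3x, appearing in 2 of 10 docs
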
def getAllTokens(senator_tweet_text):
    tokens = preprocess.tokenizeText(senator_tweet_text)
    tokens = preprocess.removeStopwords(tokens)
    # We decided to remove all 1-character words b/c they do not contain meaning
    tokens = [t for t in tokens if len(t) > 1]
    # account for common internet slang (rebuild the list so the substitution sticks)
    tokens = ['with' if t == 'w/' else t for t in tokens]
    return tokens

def alternative_method():
    import preprocess as pr
    pr.initStopWords('stopwords')
    # preprocess the held-out tweets (tokenize, remove stopwords, stem)
    held_out_tokens = pr.stemWords(
        pr.removeStopwords(pr.tokenizeText(open('held_out_tweets.txt').read())))
    dictionary = extract_dictionary('tweets.txt')
    X = extract_feature_vectors('tweets.txt', dictionary)
    y = read_vector_file('labels.txt')

def vectorizeShortDoc(raw_docs, word_vectors, is_refine=False, word_limit=100):
    """ word vectors for each short doc """
    # tokenize
    print("vectorize short docs...")
    docs = []
    for raw_doc in raw_docs:
        docs.append(preprocess.tokenizeText(raw_doc))
    #docs = preprocess.tokenizeText(raw_docs)
    if is_refine:
        docs = tfidf_helper.extract(docs, word_limit)
    docs_vecs = match_helper.findWordVectors(docs, word_vectors)
    return docs_vecs

def vectorizeLongDoc(raw_docs, word_vectors, topic_num=10, is_refine=False, word_limit=100):
    """
    raw_docs: a list of the concatenations of reviewers' works
    builds a topic vector space for each long doc
    """
    # tokenize
    print("vectorize long docs...")
    docs = []
    for raw_doc in raw_docs:
        docs.append(preprocess.tokenizeText(raw_doc))
    #docs = preprocess.tokenizeText(raw_docs)
    # if refining, keep only the top words by tf-idf
    if is_refine:
        docs = tfidf_helper.extract(docs, word_limit)
    docs_topics, topic_weights = match_helper.findHiddenTopics(docs, word_vectors, topic_num)
    return docs_topics, topic_weights

def indexDocument(document, weightingDoc, weightingQuery, invertedIndex):
    # preprocess the content provided as input
    texts = preprocess.removeSGML(document)
    texts = preprocess.tokenizeText(texts)
    texts = preprocess.removeStopwords(texts)
    texts = preprocess.stemWords(texts)
    # add the tokens to the inverted index provided as input and calculate the
    # numbers necessary to compute the weights for the given weighting schemes
    docID = texts[0]
    tf = {}
    # count term frequencies, skipping the leading document ID token
    for word in texts[1:]:
        if word not in tf:
            tf[word] = 0
        tf[word] += 1
    for word in tf:
        if word not in invertedIndex:
            invertedIndex[word] = []
        invertedIndex[word].append((docID, tf[word]))

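# Self-contained sketch (no dependency on the preprocess module) of the same
# counting logic on already-tokenized input, where the first token is the doc ID;
# illustrative only, not part of the original project.
def indexTokens(tokens, invertedIndex):
    docID, terms = tokens[0], tokens[1:]
    tf = {}
    for word in terms:
        tf[word] = tf.get(word, 0) + 1
    for word, count in tf.items():
        invertedIndex.setdefault(word, []).append((docID, count))

exampleInvIndex = {}
indexTokens(['d1', 'cat', 'sat', 'cat'], exampleInvIndex)
indexTokens(['d2', 'dog', 'sat'], exampleInvIndex)
print(exampleInvIndex)  # {'cat': [('d1', 2)], 'sat': [('d1', 1), ('d2', 1)], 'dog': [('d2', 1)]}
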
def createSentenceObjects(sentences):
    sentenceObjects = []
    # get stopwords
    with open("stopwords", "r") as s:
        stopwords = s.read().split()
    # iterate through the list of sentences
    for index, sentence in enumerate(sentences):
        # Tokenize sentence
        tokens = tokenizeText(sentence)
        # Remove stopwords from sentence tokens
        tokens = removeStopwords(tokens, stopwords)
        # Stem the tokens of the sentence
        stemmed = stemWords(tokens)
        # Remove punctuation
        stemmed = removePunks(stemmed)
        # Create ourSentence object and append to list of ourSentence objects
        sentenceObjects.append(ourSentence(sentence, stemmed, index))
    # Return the list of ourSentence objects
    return sentenceObjects

def splitIntoSentences(file):
    #print("\n\n")
    inQuote = False
    sentences = []
    # normalize curly quotes before tokenizing
    tokenized = tokenizeText(file.replace("”", "\"").replace("“", "\"")
                                 .replace("‘", "'").replace("’", "'"))
    sentence = ""
    for token in tokenized:
        # attach punctuation (other than quote marks) directly; everything else gets a leading space
        if isAPunk(token) and not (token == "\"" or token == "'" or token == "”"
                                   or token == "“" or token == "‘" or token == "’"):
            sentence += token
        else:
            sentence += " " + token
        # toggle quote state so a sentence is not ended inside a quotation
        if (token == "\"" or token == "”" or token == "“") and not inQuote:
            inQuote = True
        elif (token == "\"" or token == "”" or token == "“") and inQuote:
            inQuote = False
        # end the sentence at a delineator, as long as we are not inside a quote
        if isDelineator(token) and not inQuote:
            # skip "sentences" that are only blank lines
            if sentence.strip() != "":
                sentence = sentence.replace("”", "\"").replace("“", "\"")
                sentence = sentence.replace("‘", "'").replace("’", "'")
                sentences.append(sentence.lower())
                #print(sentence.lower(), end="\n\n")
            sentence = ""
    '''
    for index, words in enumerate(file):
        if words == "\"" and not inQuote:
            inQuote = True
        elif words == "\"" and inQuote:
            inQuote = False
        # if isDelineator(words) and not inQuote:
            if file[beginning:index+1] != "\n\n" and file[beginning:index+1] != "\n":
                sentences.append(file[beginning:index+1].replace("\n", ""))
            beginning = index + 1
    '''
    return sentences

def indexDocument(tweet, celeb, invertedIndex, docLengths):
    tokens = pro.tokenizeText(tweet)
    noStops = pro.removeStopwords(tokens)
    #stems = pro.stemWords(tokens)
    if celeb not in docLengths:
        docLengths[celeb] = 0
    for term in noStops:
        docLengths[celeb] += 1
        if term not in invertedIndex:
            # new term: [document frequency, {celeb: term frequency}]
            invertedIndex[term] = [1, {celeb: 1}]
        elif celeb not in invertedIndex[term][1]:
            invertedIndex[term][0] += 1
            invertedIndex[term][1][celeb] = 1
        else:
            invertedIndex[term][1][celeb] += 1
    return invertedIndex, docLengths

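# Illustrative only: the structure maintained above is
#   invertedIndex[term] = [document frequency, {celeb: term frequency}]
#   docLengths[celeb]   = count of indexed (non-stopword) tokens for that celeb
# A hypothetical state after indexing a few tweets might look like:
exampleInvertedIndex = {'pizza': [2, {'alice': 3, 'bob': 1}], 'vote': [1, {'bob': 2}]}
exampleDocLengths = {'alice': 3, 'bob': 3}
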
def retrieveDocuments(query, invIndex, schemeDocs, schemeQuery):
    # Preprocess query into list of tokens
    tokenList = preprocess.stemWords(preprocess.removeStopwords(
        preprocess.tokenizeText(preprocess.removeSGML(query))))
    # get query term frequencies
    queryTermFreq = {}
    for word in tokenList:
        # only include words that appear in at least one document
        if word in invIndex:
            if word in queryTermFreq:
                queryTermFreq[word] += 1.0
            else:
                queryTermFreq[word] = 1.0
    # get query length (for query term normalization)
    queryLength = 0.0
    for word in queryTermFreq:
        if word in invIndex:
            queryLength += math.pow(invIndex[word]['idf'] * queryTermFreq[word], 2)
    queryLength = math.sqrt(queryLength)
    # first scheme set is tfidf.tfidf with no normalization
    if schemeQuery == 'tfidf' and schemeDocs == schemeQuery:
        # similarity score dictionary -> maps relevant docs to similarity score;
        # the score is the dot product of the query and document tf-idf vectors
        simScores = {}
        # iterate over each word
        for word in queryTermFreq:
            # and each document that contains that word
            for docNum, tf in invIndex[word]['docList']:
                if docNum in simScores:
                    simScores[docNum] += (queryTermFreq[word] * tf * math.pow(invIndex[word]['idf'], 2))
                else:
                    simScores[docNum] = (queryTermFreq[word] * tf * math.pow(invIndex[word]['idf'], 2))
        # length normalization is deliberately not applied for this scheme:
        # for doc in simScores:
        #     simScores[doc] = simScores[doc] / (queryLength * docLengths[doc])
        # return the simScore dictionary
        return simScores
        # create simScoresList
        # simScoresList = []
        # for docNum, score in simScores.items():
        #     simScoresList.append([docNum, score])
        # simScoresList.sort(key=lambda scores: scores[1], reverse=True)
    # second scheme is tfc.nfx
    elif schemeDocs == 'tfc' and schemeQuery == 'nfx':
        # get max term frequency in query
        queryMaxTF = 0
        for word, tf in queryTermFreq.items():
            if tf > queryMaxTF:
                queryMaxTF = tf
        simScores = {}
        # iterate over each word in query and each doc that contains those words
        for word in queryTermFreq:
            for docNum, tf in invIndex[word]['docList']:
                weight = (tf * math.pow(invIndex[word]['idf'], 2) *
                          (0.5 + (0.5 * queryTermFreq[word] / queryMaxTF)))
                if docNum in simScores:
                    simScores[docNum] += weight
                else:
                    simScores[docNum] = weight
        # normalize using document length (tfc scheme for doc);
        # docLengths is a module-level dict of document vector lengths
        for doc in simScores:
            simScores[doc] = simScores[doc] / docLengths[doc]
        return simScores

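# A minimal, self-contained sketch (not part of the original project) of turning
# the simScores dictionary returned above into a ranked result list; the docIDs
# and scores below are made up.
exampleScores = {'12': 3.7, '4': 1.2, '31': 2.9}
ranked = sorted(exampleScores.items(), key=lambda pair: pair[1], reverse=True)
for docID, score in ranked:
    print(docID, score)  # prints 12, 31, 4 in descending score order
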
import preprocess as pr

pr.initStopWords('stopwords')
# preprocess the held-out tweets (tokenize, remove stopwords, stem)
held_out_tokens = pr.stemWords(
    pr.removeStopwords(pr.tokenizeText(open('held_out_tweets.txt').read())))

def processDoc(doc_string):
    # Preprocess the doc string
    doc_token = preprocess.tokenizeText(doc_string)
    #doc_token = preprocess.removeStopwords(doc_token)
    #doc_token = preprocess.stemWords(doc_token)
    return doc_token

def processDoc(doc_string):
    token_doc = preprocess.stemWords(
        preprocess.removeStopwords(preprocess.tokenizeText(doc_string)))
    return token_doc

def splitIntoSentences(file):
    #to determine if we're in a quote and thus shouldn't end a sentence yet
    inQuote = False
    #list to return
    sentences = []
    #split the document into tokens
    tokenized = tokenizeText(file)
    #store current sentence
    sentence = ""
    #store last token
    lastToken = ""
    #process each token in document
    for token in tokenized:
        #if the token is a new line and the sentence is empty, just skip it
        if token == "\n" and sentence == "":
            lastToken = token
            continue
        elif token == "\n":
            #if the token is a new line and the sentence isn't empty, append the sentence and continue
            sentences.append(sentence)
            #print(sentence)
            sentence = ""
            lastToken = token
            continue
        #if the token is punctuation (except quote marks), attach it to the end of the current sentence; otherwise, attach it with a space
        if isAPunk(token) and not (token == "\"" or token == "'" or token == "”"
                                   or token == "“" or token == "‘" or token == "’"):
            sentence += token
        else:
            sentence += " " + token
        #if we encounter a quote mark, toggle inQuote
        if (token == "\"" or token == "”" or token == "“") and not inQuote:
            inQuote = True
        elif (token == "\"" or token == "”" or token == "“") and inQuote:
            inQuote = False
        #if we reach a sentence delineator and we're not in a quote,
        if isDelineator(token) and not inQuote:
            #check that the sentence up till now is not just newlines
            if sentence.strip() != "":
                #append the new sentence, reset, and continue
                sentences.append(sentence)
                #print(sentence)
                sentence = ""
        #keep track of last token
        lastToken = token
    '''
    for index, words in enumerate(file):
        if words == "\"" and not inQuote:
            inQuote = True
        elif words == "\"" and inQuote:
            inQuote = False
        # if isDelineator(words) and not inQuote:
            if file[beginning:index+1] != "\n\n" and file[beginning:index+1] != "\n":
                sentences.append(file[beginning:index+1].replace("\n", ""))
            beginning = index + 1
    '''
    return sentences

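# Self-contained sketch of the quote-aware splitting idea used above, operating on
# an already-tokenized input with simple stand-ins for isAPunk/isDelineator;
# illustrative only, not the project's actual helpers.
def splitTokens(tokens):
    sentences, sentence, inQuote = [], "", False
    for tok in tokens:
        sentence += tok if tok in ".,!?;:" else " " + tok
        if tok == '"':
            inQuote = not inQuote          # toggle quote state
        if tok in ".!?" and not inQuote:   # only end sentences outside quotes
            if sentence.strip():
                sentences.append(sentence.strip())
            sentence = ""
    return sentences

# the period inside the quotation does not end the sentence:
print(splitTokens(['He', 'said', ',', '"', 'Stop', '.', '"', 'Then', 'he', 'left', '.']))
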