Example #1
def indexDocument(inString, schemeDocs, schemeQuery, invIndex):
    # check scheme
    if schemeDocs == 'tfidf' or schemeDocs == 'tfc':
        # Preprocess input string into list of tokens
        tokenList = preprocess.stemWords((preprocess.removeStopwords(
            (preprocess.tokenizeText(preprocess.removeSGML(inString))))))

        # get document number and increment doc-count
        docNum = invIndex['doc-count']
        invIndex['doc-count'] += 1

        # build temporary dictionary of term frequencies for this document
        # wordDict { 'word': tf }
        wordDict = {}
        for word in tokenList:
            if word in wordDict:
                wordDict[word] += 1.0
            else:
                wordDict[word] = 1.0

        # add entries to invIndex for each word
        # increments document frequency where necessary
        for word, tf in wordDict.items():
            if word in invIndex:
                invIndex[word]['docList'].append([docNum, tf])
                invIndex[word]['df'] += 1.0
            else:
                invIndex[word] = {'df': 1.0, 'docList': [[docNum, tf]]}
        return invIndex
    else:
        sys.exit("Document weighting scheme '" + schemeDocs +
                 "' is not acceptable input. Try 'tfidf' or 'tcf'.")
Example #2
def processText(text, stopwords=False, stem=False):
	tokens = removeSGML(text)
	tokens = tokenizeText(tokens)
	if stopwords:
		tokens = removeStopwords(tokens)
	if stem:
		tokens = stemWords(tokens)
	return tokens
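processText assumes the removeSGML/tokenizeText/removeStopwords/stemWords helpers from the course's preprocess module. As a rough, hypothetical stand-in (the regex tag stripper, toy stopword list, and naive suffix stemmer below are simplifications, not the real implementations), the same pipeline order looks like this:

import re

STOPWORDS = {'the', 'a', 'an', 'of', 'and', 'to', 'in'}   # toy stopword list

def remove_sgml(text):
    # strip <TAG>...</TAG> markup; a simplification of removeSGML
    return re.sub(r'<[^>]+>', ' ', text)

def tokenize_text(text):
    # lowercase word tokenizer; the real tokenizeText is more elaborate
    return re.findall(r"[a-z0-9']+", text.lower())

def remove_stopwords(tokens):
    return [t for t in tokens if t not in STOPWORDS]

def stem_words(tokens):
    # naive suffix stripping in place of a real stemmer
    return [re.sub(r'(ing|ed|s)$', '', t) for t in tokens]

def process_text(text, stopwords=False, stem=False):
    tokens = tokenize_text(remove_sgml(text))
    if stopwords:
        tokens = remove_stopwords(tokens)
    if stem:
        tokens = stem_words(tokens)
    return tokens

print(process_text("<DOC>The cats jumped over the fence</DOC>",
                   stopwords=True, stem=True))
# ['cat', 'jump', 'over', 'fence']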
Example #3
def retrieveDocuments(query, invertedIndex, weightingDoc, weightingQuery):
    # preprocess the query
    que = preprocess.removeSGML(query)
    que = preprocess.tokenizeText(que)
    que = preprocess.removeStopwords(que)
    que = preprocess.stemWords(que)
    del que[0]
    # determine the set of documents that contain at least one query token
    tfque = {}
    docSet = set()
    for token in que:
        if token not in invertedIndex:
            continue
        if token not in tfque:
            tfque[token] = 0
        tfque[token] += 1
        for pair in invertedIndex[token]:
            docSet.add(pair[0])
    queList = list(tfque.keys())
    relDoc = {}
    # tfidf.tfidf
    if weightingDoc == "tfidf" and weightingQuery == "tfidf":
        docWeight, queWeight = cal_tfidf(queList, tfque, docSet, invertedIndex)
        for docID in docWeight.keys():
            relDoc[docID] = 0
            for idx, tf in enumerate(docWeight[docID]):
                relDoc[docID] += tf * queWeight[idx]
    #tfidf.bpx
    elif weightingDoc == "tfidf" and weightingQuery == "bpx":
        docWeight, queWeight_f = cal_tfidf(queList, tfque, docSet,
                                           invertedIndex)
        queWeight = cal_bpx(queList, tfque, invertedIndex)
        for docID in docWeight.keys():
            relDoc[docID] = 0
            for idx, tf in enumerate(docWeight[docID]):
                relDoc[docID] += tf * queWeight[idx]
    #nxx.tfidf
    elif weightingDoc == "nxx" and weightingQuery == "tfidf":
        docWeight_f, queWeight = cal_tfidf(queList, tfque, docSet,
                                           invertedIndex)
        docWeight = cal_nxx(queList, docSet, invertedIndex)
        for docID in docWeight.keys():
            relDoc[docID] = 0
            for idx, tf in enumerate(docWeight[docID]):
                relDoc[docID] += tf * queWeight[idx]
    #nxx.bpx
    elif weightingDoc == "nxx" and weightingQuery == "bpx":
        docWeight = cal_nxx(queList, docSet, invertedIndex)
        queWeight = cal_bpx(queList, tfque, invertedIndex)
        for docID in docWeight.keys():
            relDoc[docID] = 0
            for idx, tf in enumerate(docWeight[docID]):
                relDoc[docID] += tf * queWeight[idx]
    else:
        print "Weighting scheme for doc is [tfidf, nxx], for query is [tfidf, bpx]"
        quit()
    return relDoc
def getAllTokens(senator_tweet_text):
    tokens = preprocess.tokenizeText(senator_tweet_text)
    tokens = preprocess.removeStopwords(tokens)
    # We decided to remove all one-character words because they carry little meaning
    tokens = [t for t in tokens if len(t) > 1]
    # account for common internet slang ('w/' -> 'with')
    tokens = ['with' if t == 'w/' else t for t in tokens]
    return tokens
Example #5
def alternative_method():

    import preprocess as pr
    pr.initStopWords('stopwords')

    pr.stemWords(
        pr.removeStopwords(pr.tokenizeText(
            open('held_out_tweets.txt').read())))

    dictionary = extract_dictionary('tweets.txt')
    X = extract_feature_vectors('tweets.txt', dictionary)
    y = read_vector_file('labels.txt')
def vectorizeShortDoc(raw_docs, word_vectors, is_refine=False, word_limit=100):
    """
    Build word vectors for each short document.
    """
    # tokenize
    print("vectorize short docs...")
    docs = []
    for raw_doc in raw_docs:
        docs.append(preprocess.tokenizeText(raw_doc))
    #docs = preprocess.tokenizeText(raw_docs)
    if is_refine:
        docs = tfidf_helper.extract(docs, word_limit)
    docs_vecs = match_helper.findWordVectors(docs, word_vectors)
    return docs_vecs
def vectorizeLongDoc(raw_docs, word_vectors, topic_num=10, is_refine=False, word_limit=100):
    """
    raw_docs: a list of the concatenation of each reviewer's works
    Build hidden-topic vectors and topic weights for each long document.
    """
    # tokenize
    print("vectorize long docs...")
    docs = []
    for raw_doc in raw_docs:
        docs.append(preprocess.tokenizeText(raw_doc))
    #docs = preprocess.tokenizeText(raw_docs)
    # if refine with tf-idf methods
    if is_refine:
        docs = tfidf_helper.extract(docs, word_limit)
    docs_topics, topic_weights = match_helper.findHiddenTopics(docs, word_vectors, topic_num)
    return docs_topics, topic_weights
Example #8
def indexDocument(document, weightingDoc, weightingQuery, invertedIndex):
    # preprocess the content provided as input
    texts = preprocess.removeSGML(document)
    texts = preprocess.tokenizeText(texts)
    texts = preprocess.removeStopwords(texts)
    texts = preprocess.stemWords(texts)
    # add the tokens to the inverted index provided as input and compute the counts needed for the given weighting schemes
    docID = texts[0]
    tf = {}
    for word in texts:
        if word not in tf:
            tf[word] = 0
        tf[word] += 1
    for word in tf.keys():
        if word not in invertedIndex:
            invertedIndex[word] = []
        invertedIndex[word].append((docID, tf[word]))
def createSentenceObjects(sentences):
    sentenceObjects = []
    # get stopwords
    s = open("stopwords", "r")
    stopwords = s.read().split()
    # iterate through the list of sentences
    for index, sentence in enumerate(sentences):
        # Tokenize sentence
        tokens = tokenizeText(sentence)
        # Remove stopwords from sentence tokens
        tokens = removeStopwords(tokens, stopwords)
        # Stem the tokens of the sentence
        stemmed = stemWords(tokens)
        # Remove punctuations
        stemmed = removePunks(stemmed)
        # Create ourSentence object and append to list of ourSentence objects
        sentenceObjects.append(ourSentence(sentence, stemmed, index))
    # Return the list of ourSentence objects
    return sentenceObjects
Example #10
def splitIntoSentences(file):
    #print("\n\n")
    inQuote = False
    sentences = []
    beginning = 0
    tokenized = tokenizeText(file.replace("”", "\"").replace("“", "\"").replace("‘", "'").replace("’", "'"))
    sentence = ""
    for token in tokenized:
        # attach punctuation (except quote marks) directly to the sentence;
        # everything else gets a leading space
        if isAPunk(token) and token not in ("\"", "'", "”", "“", "‘", "’"):
            sentence += token
        else:
            sentence += " " + token

        # toggle inQuote whenever a double quote mark is seen
        if token in ("\"", "”", "“"):
            inQuote = not inQuote
        # end the sentence at a delineator, but only outside quotations
        if isDelineator(token) and not inQuote:
            # skip sentences that are only whitespace/newlines
            if sentence.strip():
                sentence = sentence.replace("”", "\"").replace("“", "\"")
                sentence = sentence.replace("‘", "'").replace("’", "'")
                sentences.append(sentence.lower())
                #print(sentence.lower(), end="\n\n")
                sentence = ""
    '''
    for index, words in enumerate(file):
        if(words=="\"" and not inQuote):
            inQuote=True
        elif(words=="\"" and inQuote):
            inQuote=False #
        if(isDelineator(words) and not inQuote):
            if(file[beginning:index+1]!="\n\n" and file[beginning:index+1]!="\n"):
                sentences.append(file[beginning:index+1].replace("\n", ""))
                beginning=index+1
    '''
            
    return sentences
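For comparison, here is a minimal, hypothetical sketch of the same quote-aware splitting idea, working character by character instead of over tokenizeText output; it only handles double quotes and the . ! ? delineators:

def split_into_sentences_sketch(text):
    # Toggle in_quote on double quotes and end a sentence on . ! ?
    # only when outside a quotation (a simplification of Example #10).
    sentences = []
    current = []
    in_quote = False
    for ch in text.replace("“", "\"").replace("”", "\""):
        current.append(ch)
        if ch == "\"":
            in_quote = not in_quote
        elif ch in ".!?" and not in_quote:
            sentence = "".join(current).strip()
            if sentence:
                sentences.append(sentence.lower())
            current = []
    tail = "".join(current).strip()
    if tail:
        sentences.append(tail.lower())
    return sentences

print(split_into_sentences_sketch('He said "Stop. Wait!" and left. Then rain.'))
# ['he said "stop. wait!" and left.', 'then rain.']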
Example #11
def indexDocument(tweet, celeb, invertedIndex, docLengths):
    tokens = pro.tokenizeText(tweet)
    noStops = pro.removeStopwords(tokens)
    #stems = pro.stemWords(tokens)

    if celeb not in docLengths:
        docLengths[celeb] = 0

    for term in noStops:
        docLengths[celeb] += 1
        if term not in invertedIndex:
            invertedIndex[term] = []
            invertedIndex[term].append(1)
            invertedIndex[term].append({})
            invertedIndex[term][1][celeb] = 1
        elif celeb not in invertedIndex[term][1]:
            invertedIndex[term][0] += 1
            invertedIndex[term][1][celeb] = 1
        elif celeb in invertedIndex[term][1]:
            invertedIndex[term][1][celeb] += 1

    return invertedIndex, docLengths
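In Example #11 each invertedIndex entry is a two-element list, [document frequency, {celeb: term frequency}], and docLengths holds token counts per celeb. A hypothetical companion sketch (the function name and the exact tf-idf variant are illustrative, not taken from the original code) for reading a tf-idf weight off that structure:

import math

def tfidf_for_term(term, doc, inverted_index, doc_lengths, num_docs):
    # entry layout assumed from Example #11: [df, {doc: tf}]
    if term not in inverted_index or doc not in inverted_index[term][1]:
        return 0.0
    df, postings = inverted_index[term]
    tf = postings[doc] / doc_lengths[doc]   # length-normalized term frequency
    idf = math.log(num_docs / df)           # standard idf
    return tf * idf

inverted_index = {'vote': [2, {'senatorA': 3, 'senatorB': 1}]}
doc_lengths = {'senatorA': 10, 'senatorB': 8}
print(tfidf_for_term('vote', 'senatorA', inverted_index, doc_lengths, num_docs=50))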
Example #12
def retrieveDocuments(query, invIndex, schemeDocs, schemeQuery):
    # Preprocess query into list of tokens
    tokenList = preprocess.stemWords((preprocess.removeStopwords(
        (preprocess.tokenizeText(preprocess.removeSGML(query))))))

    # get query term frequencies
    queryTermFreq = {}
    for word in tokenList:
        # only include words that appear in at least one document
        if word in invIndex:
            if word in queryTermFreq:
                queryTermFreq[word] += 1.0
            else:
                queryTermFreq[word] = 1.0

    # get query length, (query term normalization)
    queryLength = 0.0
    for word in queryTermFreq:
        if word in invIndex:
            queryLength += math.pow(invIndex[word]['idf'] * queryTermFreq[word],
                                    2)
    queryLength = math.sqrt(queryLength)

    # first scheme set is tfidf.tfidf with no normalization
    if schemeQuery == 'tfidf' and schemeDocs == schemeQuery:
        # create similarity score dictionary -> maps relevant docs to similarity score
        # first step is to create the numerator (dot product), then divide all terms by denominator (normalization)
        # using tfc method for query and document
        simScores = {}
        # iterate over each word
        for word in queryTermFreq:
            # and each document that contains that word
            for docNum, tf in invIndex[word]['docList']:
                if docNum in simScores:
                    simScores[docNum] += (queryTermFreq[word] * tf *
                                          math.pow(invIndex[word]['idf'], 2))
                else:
                    simScores[docNum] = (queryTermFreq[word] * tf *
                                         math.pow(invIndex[word]['idf'], 2))

        # dividing each dot product by the normalization factor was removed on purpose:
        # REMOVED --
        # for doc in simScores:
        # 	simScores[doc] = simScores[doc] / (queryLength * docLengths[doc])

        # return the simScore dictionary
        return simScores

        # create simScoresList
        # simScoresList = []
        # for docNum, score in simScores.iteritems():
        # 	simScoresList.append([docNum, score])
        # simScoresList.sort(key=lambda scores: scores[1], reverse=True)

    # second scheme is tfc.nfx
    elif schemeDocs == 'tfc' and schemeQuery == 'nfx':
        # get max term frequency in query
        queryMaxTF = 0
        for word, tf in queryTermFreq.items():
            if tf > queryMaxTF:
                queryMaxTF = tf

        simScores = {}

        # iterate over each word in query and each doc that contains those words
        for word in queryTermFreq:
            for docNum, tf in invIndex[word]['docList']:
                if docNum in simScores:
                    simScores[docNum] += (
                        tf * math.pow(invIndex[word]['idf'], 2) *
                        (0.5 + (0.5 * queryTermFreq[word] / queryMaxTF)))
                else:
                    simScores[docNum] = (
                        tf * math.pow(invIndex[word]['idf'], 2) *
                        (0.5 + (0.5 * queryTermFreq[word] / queryMaxTF)))

        # normalize using document length (tfc scheme for doc)
        for doc in simScores:
            simScores[doc] = simScores[doc] / docLengths[doc]

        return simScores
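The tfidf.tfidf branch above accumulates, for every query term, queryTF * docTF * idf^2 into each document's score, i.e. the dot product of un-normalized tf-idf vectors. A small self-contained sketch of that scoring step, assuming the same {'idf': ..., 'docList': [[docNum, tf], ...]} entry layout:

import math

def score_query(query_tf, inv_index):
    # dot product of query and document tf-idf vectors, no normalization
    scores = {}
    for word, q_tf in query_tf.items():
        if word not in inv_index:
            continue
        idf = inv_index[word]['idf']
        for doc_num, d_tf in inv_index[word]['docList']:
            scores[doc_num] = scores.get(doc_num, 0.0) + q_tf * d_tf * idf * idf
    return scores

inv_index = {
    'cat': {'idf': math.log(4 / 2), 'docList': [[1, 2.0], [3, 1.0]]},
    'mat': {'idf': math.log(4 / 1), 'docList': [[1, 1.0]]},
}
print(score_query({'cat': 1.0, 'mat': 1.0}, inv_index))
# doc 1 scores highest: it contains both query terms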
Example #13
import preprocess as pr
pr.initStopWords('stopwords')
pr.stemWords(
    pr.removeStopwords(pr.tokenizeText(open('held_out_tweets.txt').read())))
def processDoc(doc_string):
    # Preprocess the doc strings
    doc_token = preprocess.tokenizeText(doc_string)
    #doc_token = preprocess.removeStopwords(doc_token)
    #doc_token = preprocess.stemWords(doc_token)
    return doc_token
def processDoc(doc_string):
    token_doc = preprocess.stemWords(preprocess.removeStopwords(preprocess.tokenizeText(doc_string)))
    return token_doc
def splitIntoSentences(file):
    #to determine if we're in a quote and thus shouldn't end a sentence yet
    inQuote = False
    #list to return
    sentences = []
    #store the beginning of the current sentence
    beginning = 0
    #split sentence into tokens
    tokenized = tokenizeText(file)
    #store current sentence
    sentence = ""
    #store last token
    lastToken = ""
    #process each token in document
    for token in tokenized:
        #if the token is a new line and the sentence is empty, just skip this
        if (token == "\n" and sentence == ""):
            lastToken = token
            continue
        elif (token == "\n"):
            #if the token is a new line and the sentence isnt empty, append the sentence and continue
            sentences.append(sentence)
            #print(sentence)
            sentence = ""
            lastToken = token
            continue

        #if the token is a punctuation (except quote marks), just attach it to the end of the current sentence. Otherwise, attach it with a space
        if (isAPunk(token)
                and not (token == "\"" or token == "'" or token == "”"
                         or token == "“" or token == "‘" or token == "’")):
            sentence += token
        else:
            sentence += " " + token

        #if we encounter a quote mark, switch between  inQuote = true or false
        if ((token == "\"" or token == "”" or token == "“") and not inQuote):
            inQuote = True
        elif ((token == "\"" or token == "”" or token == "“") and inQuote):
            inQuote = False  #''
        #if we reach a sentence delineator and we're not in a quote,
        if ((isDelineator(token)) and not inQuote):
            #skip the sentence if it is only whitespace/newlines
            if sentence.strip():
                #otherwise, append the new sentence, reset, and continue
                sentences.append(sentence)
                #print(sentence)
                sentence = ""
        #keep track of last Token
        lastToken = token
    '''
    for index, words in enumerate(file):
        if(words == "\"" and not inQuote):
            inQuote = True
        elif(words == "\"" and inQuote):
            inQuote = False #
        if(isDelineator(words) and not inQuote):
            if(file[beginning:index+1] != "\n\n" and file[beginning:index+1]!= "\n"):
                sentences.append(file[beginning:index+1].replace("\n", ""))
                beginning=index+1
    '''

    return sentences