Example 1
 def stemWord(self, fileName, preprocessedFileName=''):
     "Stemming word and write to temp file"
     p = PorterStemmer()
     print('Preprocessing...')
     print('Stemming words...')
     if len(preprocessedFileName) != 0:
         self.tempFileName = preprocessedFileName
     with open(self.tempFileName, 'w') as outputfile:
         with open(fileName, 'r') as file:
             while 1:
                 word = ''
                 line = file.readline()
                 if line == '':
                     break
                 # extract the first word (the category label) and write it unchanged
                 category = ''
                 for ch in line:
                     if ch == ' ':
                         if len(category) != 0:
                             outputfile.write(category + ' ')
                             break
                     else:
                         category += ch
                 # stem the rest of the line, skipping past the category label
                 for i in range(len(category) + 1, len(line)):
                     if line[i].isalpha():
                         word += line[i].lower()
                     else:
                         if word:
                             outputfile.write(p.stem(
                                 word, 0,
                                 len(word) - 1))
                             word = ''
                         outputfile.write(line[i].lower())
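Most of the examples in this collection call the classic reference port of the Porter stemmer, whose stem(word, i, j) method takes the word plus inclusive start and end indices, while a few later examples use a wrapper that accepts the word alone. Below is a minimal sketch of the index-based calling convention, assuming a PorterStemmer class with that reference signature (the module name porter is only illustrative):

# Sketch only: the index-based calling convention used by most examples here.
from porter import PorterStemmer  # assumed module name for the reference implementation

p = PorterStemmer()
for word in ["running", "cats", "stemming"]:
    # stem(word, i, j) stems word[i:j+1]; 0 and len(word) - 1 cover the whole word
    print(p.stem(word, 0, len(word) - 1))  # -> run, cat, stem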
Example 2
def convert_keyboard_query():
    qry = raw_input("Type in your query:")
    words = qry.strip().split(' ')
    p = PorterStemmer()
    QUERY_WEIGHT = 2
    new_doc_vec = defaultdict(int)
    for word in words:
        word = word.strip()
        if re.search('[a-zA-Z]', word):
            word = word.lower()
            word = p.stem(word, 0, len(word) - 1)
            if word in new_doc_vec:
                new_doc_vec[word] += QUERY_WEIGHT
            elif word not in stoplist_hash and word in corp_freq_hash:
                new_doc_vec[word] = QUERY_WEIGHT
            else:
                continue

    new_vect = defaultdict(int)
    for key in new_doc_vec:
        new_vect[key] = new_doc_vec[key]
        if key in synonyms:
            sim_words_list = synonyms_list[synonyms[key]]
            for sim_word in sim_words_list:
                if sim_word not in stoplist_hash and re.search(
                        "[a-zA-z]", sim_word):
                    if corp_freq_hash[sim_word] > 1:
                        new_vect[sim_word] = new_doc_vec[key]

    return new_vect
Example 3
def Word_appear_count(text, type, Word_count_pubmed, Word_count_twitter, Word_count_all):
    text = remove_tag(text)
    word = text.split()
    p = PorterStemmer()
    for i in word:
        i = p.stem(i, 0, len(i) - 1)  # porter

        # pubmed
        if i not in Word_count_pubmed.keys():
            Word_count_pubmed[i] = 0
            if type == 'pubmed':
                Word_count_pubmed[i] += 1
        elif i in Word_count_pubmed.keys() and type == 'pubmed':
            Word_count_pubmed[i] += 1

        # twitter
        if i not in Word_count_twitter.keys():
            Word_count_twitter[i] = 0
            if type == 'twitter':
                Word_count_twitter[i] += 1
        elif i in Word_count_twitter.keys() and type == 'twitter':
            Word_count_twitter[i] += 1

        # all
        if i not in Word_count_all.keys():
            Word_count_all[i] = 1
        elif i in Word_count_all.keys():
            Word_count_all[i] += 1

    return Word_count_pubmed, Word_count_twitter, Word_count_all
Example 4
    def __init__(self, stop_words_file=""):
        self.word_doc_frequency = collections.defaultdict(lambda: collections.defaultdict(lambda: 0))
        self.doc_class_frequency = collections.defaultdict(lambda: collections.defaultdict(lambda: 0))
        self.total_words = 0

        self.class_word_frequency = collections.defaultdict(lambda: collections.defaultdict(lambda: 0))
        self.class_doc_frequency = collections.defaultdict(lambda: collections.defaultdict(lambda: 0))

        self.stop_words = collections.defaultdict(lambda: False)
        self.stemmer = PorterStemmer()

        self.test_set = collections.defaultdict(lambda: [])
        self.total_doc_test_set = 0

        self.train_set = collections.defaultdict(lambda: [])
        self.total_doc_train_set = 0

        self.data_set_directory = ""

        if stop_words_file != "":
            with open(stop_words_file, 'r') as f:
                for line in f:
                    for w in line.split():
                        w = self.normalize(w)
                        self.stop_words[w] = True
Example 5
    def __init__(self, parent, docno, doc, terms):
        QtGui.QDialog.__init__(self, parent)

        self.setupUi(self)

        # Set fields
        self.labelDocumentNo.setText(docno)

        textDocument = self.textEdit.document()
        textCursor = QtGui.QTextCursor(textDocument)

        normalFormat = QtGui.QTextCharFormat()
        termFormat = QtGui.QTextCharFormat()
        termFormat.setForeground(QtGui.QBrush(QtGui.QColor("red")))
        termFormat.setFontWeight(QtGui.QFont.Bold)

        textCursor.beginEditBlock()

        stemmer = PorterStemmer()
        terms = terms.split(",")
        stemmed_terms = [stemmer.stem(term, 0, len(term)-1) for term in terms]

        for line in unicode(doc).split("\n"):
            for word in line.split(" "):
                nword = word.lower().strip(punctuation)
                sword = stemmer.stem(nword, 0, len(nword)-1)
                if nword in terms or sword in stemmed_terms:
                    textCursor.insertText(word, termFormat)
                else:
                    textCursor.insertText(word, normalFormat)
                textCursor.insertText(" ", normalFormat)

            textCursor.insertText("\n", normalFormat)

        self.textEdit.moveCursor(QtGui.QTextCursor.Start)
Example 6
    def __init__(self, is_turbo=False):
      self.name = 'Leroy'
      self.userName = ''
      self.is_turbo = is_turbo
      self.stemmer = PorterStemmer()
      self.read_data()
      self.parsed_sentiment = dict()
      self.negationWords = ["didn't","not","no","don't"]
      self.punctuation = {"but",",",".","!",":",";"}
      self.strongPosVerbs = {"love","loved","adored","adore","enjoy","enjoyed"}
      self.strongPosAdjectives = {"amazing","cool","awesome","favorite"}
      self.strongNegVerbs = {"hate","hated","abhored","abhor","loathed","loathe","dispised","dispise"}
      self.strongNegAdjectives = {"apalling"}
      self.intensifiersSubject = {"really","reeally","extremely","absolutely"}
      self.intensifiersObject = {"really","reeally","very","extremely","remarkably","unusually","utterly","absolutely","exceptionally"}
      self.corrected_movie_trigger = False

      #For Two movie input
      self.similarity_words = {"either", "neither", "both", "and"}
      self.disimilarity_words = {"but"} #TODO: any more?

      self.userMovies = collections.defaultdict()
      self.userEmotions = [0,0,0,0,0] # anger, disgust, fear, joy, sadness
      self.movieDict = collections.defaultdict(lambda:0)
      self.genreDict = collections.defaultdict(lambda:0)
      self.movieIDToName = collections.defaultdict(lambda:0)
      self.movie_name_to_id()
      self.movie_history = []
      self.movie_recommendations = []
Example 7
    def __init__(self, creative=False):
      # The chatbot's default name is `moviebot`. Give your chatbot a new name.
      self.name = 'Lit!'

      self.creative = creative

      # This matrix has the following shape: num_movies x num_users
      # The value stored in row i and column j is the rating for
      # movie i by user j.
      self.titles, ratings = movielens.ratings()

      self.sentiment = {}
      self.porter_stemmer = PorterStemmer()
      sentimentCopy = movielens.sentiment()

      for k, v in sentimentCopy.items():
        key = self.porter_stemmer.stem(k)
        self.sentiment[key] = v


      self.user_ratings = []
      #############################################################################
      # TODO: Binarize the movie ratings matrix.                                  #
      #############################################################################
      ratings = self.binarize(ratings)
      # Binarize the movie ratings before storing the binarized matrix.
      self.ratings = ratings
Example 8
    def __init__(self, is_turbo=False):
        self.name = 'moviebot'
        self.is_turbo = is_turbo
        self.p = PorterStemmer()
        self.read_data()
        #   self.titles, self.ratings = ratings()
        self.binarize()
        self.RecommendationStrings = [
            "I think you should check out %s! ",
            "This movie will blow your mind: %s. ",
            "Watch %s. It will ruin all other movies for you. "
        ]

        self.ratedMovieList = {}
        self.userRatingVector = np.zeros(len(self.titles))
        self.recommendedMovies = []

        self.inTheMiddleOfSentimentAnalysis = False
        self.currentMovieForMoreInformation = ""

        self.TwoMoviesBoolean = False
        self.currentConjunction = ""
        self.sentimentOfPreviousMovie = 0
        self.check = {}
        self.distanceThreshold = 10
        self.confirm = False
        self.previousInput = ""
Example 9
def process_word(token):
    token = token.lower()
    if constants.STEM is True:
        p = PorterStemmer()
        token = p.stem(token, 0,len(token)-1)                       
    
    return token
Example 10
 def __init__(self, is_turbo=False):
     self.name = 'IAN'
     self.is_turbo = is_turbo
     self.read_data()
     self.stemmer = PorterStemmer()
     self.counter = 0
     self.already_seen = []
     self.recommendations = []
     self.delimiters = [".", ",", ";", "!", "?", ":"]
     self.usersentiment = 0
     self.usermovie = ""
     self.clarify = 0
     self.check_spelling_flag = 0
     # build stemmed dictionary
     self.stemmed_sentiment = {}
     for word in self.sentiment.keys():
         self.stemmed_sentiment[self.stemmer.stem(word, 0,
                                                  len(word) -
                                                  1)] = self.sentiment[word]
     # build editCounts dictionary for spell checking
     self.editCounts = collections.defaultdict(list)
     with open("deps/count_1edit.txt") as f:
         for line in f:
             rule, countString = line.split("\t")
             originalText, editedText = rule.split("|")
             self.editCounts[originalText].append(
                 (editedText, int(countString)))
             # keep candidate edits ordered by observed count
             self.editCounts[originalText].sort(key=lambda x: x[1])
     self.check_spelling = ""
     self.already_mentioned = []
     self.fromList = []
Example 11
def stem_words(list_of_tokens):
    stemmer = PorterStemmer()  # Declares the stemmer object
    for token_index, token in enumerate(list_of_tokens):
        list_of_tokens[token_index] = stemmer.stem(
            token, 0,
            len(token) - 1)  # Stems the word using the function

    return list_of_tokens  # Returns the "post-stem" list of tokens
Example 12
def stem(tokens):
    p = PorterStemmer()
    stems = []
    for token in tokens:
        stem = p.stem(token, 0, len(token) - 1)
        stems.append(stem)

    return list(filter(None, stems))
Example 13
 def __init__(self):
     # For holding the data - initialized in read_data()
     self.titles = []
     self.docs = []
     self.vocab = []
     # For the text pre-processing.
     self.alphanum = re.compile('[^a-zA-Z0-9]')
     self.p = PorterStemmer()
Example 14
def stemWords(inList):
    ## Function that stems the words.
    ## Name: stemWords; input: list (of tokens); output: list (of stemmed tokens)
    outlist = []
    p = PorterStemmer()
    for word in inList:
        outlist.append(p.stem(word, 0, len(word)-1))
    return outlist
Example 15
    def __init__(self, creative=False):
        # The chatbot's default name is `moviebot`. Give your chatbot a new name.
        self.name = 'moviebot'

        self.creative = creative

        # This matrix has the following shape: num_movies x num_users
        # The value stored in row i and column j is the rating for
        # movie i by user j.
        self.titles, ratings = movielens.ratings()
        self.sentiment = movielens.sentiment()
        self.new_sentiment = {}
        self.p = PorterStemmer()

        # create a new sentiment dict with stemmed keys
        for key in self.sentiment:
            new_key = self.p.stem(key)
            self.new_sentiment[new_key] = self.sentiment[key]

        self.bin_ratings = self.binarize(ratings)

        # a tuple with the sentiment of the movie being discussed
        self.current_sentiment = None
        # the movie title entered by the user
        self.current_title = None
        # a list of current movie candidates
        self.current_idxs = []

        self.prev_movie = None
        self.prev_sentiment = None

        # a dict where dict[i] = j is the user's sentiment j for movie index i
        # for movies that the user has described and the chatbot has processed
        self.user_movies = {}

        # a set of movie indexes that the user has already described
        self.user_movie_set = set()

        self.prefix_match_found = False
        self.disambig = False

        # if chatbot is in recommend mode, only respond to yes or no
        self.recommend_mode = False

        # a list of recommendations for the user
        self.recommendations = []
        self.recommend_idx = 0

        # preprocess movie list by extracting possible titles and year
        self.movies = []
        for entry in self.titles:
            self.movies.append(extract_titles_and_year(entry[0]))
        #############################################################################
        # TODO: Binarize the movie ratings matrix.                                  #
        #############################################################################

        # Binarize the movie ratings before storing the binarized matrix.
        self.ratings = ratings
Example 16
 def stemming(self, tokens):
     stemmed_tokens = []
     stem_func = PorterStemmer()
     for c in tokens:
         if c.isalpha():
             stemmed_tokens.append(stem_func.stem(c, 0,len(c)-1))
         else:
             stemmed_tokens.append(c)
     return stemmed_tokens
Example 17
 def stemInputAndCheckMatch(self, uType, word):
     ps = PorterStemmer()
     stemmedWord = ps.stem(word)
     matchingWords = self.checkMatches(uType, stemmedWord)
     data = self.getMostFrequentWords(matchingWords)
     if (data[1] != 1):
         return data[0]
     else:
         return []
Example 18
    def __init__(self, path, num_records):
        self.porter = PorterStemmer()
        self.stop = set()
        with open('stop.words.dat', 'r') as sw:
            for line in sw:
                self.stop.add(line[:-1])

        if path != '' and num_records != 0:
            self.process(path, num_records)
Example 19
def load_dictionary(filename, stem=True):
    """Loads line separated dictionary into a list"""
    out = []
    for word in open("dictionaries/%s" % filename, "r"):
        word = word.strip().lower()  # strip the trailing newline before stemming
        if stem is True:
            p = PorterStemmer()
            word = p.stem(word, 0, len(word) - 1)
        out.append(word)
    return out
Example 20
 def __init__(self):
     self.pStemmer = PorterStemmer()
     self.num2Word = NumberToWord()
     self.stopWords = [
         "i", "me", "my", "we", "the", "on", "and", "in", "to", "s", "t",
         "a", "an", "at", "of", "is", "or", "by", "it", "as", "be"
     ]
     self.sOutput = []
     self.artistNames = []
     self.albumNames = []
Example 21
    def __init__(self):
        self.vocabulary = []
        self.invertedIndex = {}
        self.documents = []
        self.documentsUnstemmed = []
        self.tfidf = {}  # {word: { docId: tfidf}}
        self.docIdToFilename = {}

        self.stemmer = PorterStemmer()
        self.stopWords = []
Example 22
    def preprocess(self, query):
        p = PorterStemmer()
        result = []

        # remove any non-alphanumeric characters [a-zA-Z0-9_]
        query = re.sub(r"[^\w]", " ", query)
        query = query.lower().split(' ')
        for word in query:
            if word not in self.stopwords:
                result.append(p.stem(word, 0, len(word) - 1))
        return result
Example 23
 def __init__(self):
     '''
     self.saver : Pickles the object on to the disk.
     self.ps    : Porter stemmer class object. It is required to get the stem of a word.
     self.st    : Class object to check for stop words.
     '''
     self.saver = save_object()
     self.ps = PorterStemmer()
     self.st = stopwords()
     self.m_ds = {}
     self.models = {}
     self.m_mod = model()
Example 24
def format_description(text, stop_words):
    words = text.split()

    stemmer = PorterStemmer()
    non_stop_words = []
    for word in words:
        if word not in stop_words:      # Not a stop word, so lower, remove punctuation, and stem
            lowered_token = remove_punctuation(word).lower()
            #non_stop_words.append(lowered_token)
            non_stop_words.append(stemmer.stem(lowered_token))

    return ' '.join(non_stop_words)
Example 25
def finalize(tInput, swInput):
    p = PorterStemmer()
    output = open("output.txt", 'w')
    for i in range(len(tInput)):
        token = tInput[i]
        if token == "a" or token == "an" or token == "the":
            output.write("%s\t- article\n" % token)
        elif any(token in x for x in swInput):
            output.write("%s\t- stop word\n" % token)
        else:
            stemword = p.stem(token, 0, len(token) - 1)
            output.write("%s\t- %s\n" % (token, stemword))
    output.close()
Example 26
 def stem_text(text):
     p = PorterStemmer()
     stemmed_text = ''
     word = ''
     for i, c in enumerate(text):
         if c.isalpha():
             word += c.lower()
         if not c.isalpha() or i == (len(text) - 1):
             if word:
                 stemmed_text += p.stem(word, 0,len(word)-1)
                 word = ''
             if c.lower() == ' ':
                 stemmed_text += c.lower()
     return stemmed_text
Example 27
    def __init__(self, stop_words_file=""):
        self.stop_words = collections.defaultdict(lambda: False)
        self.stemmer = PorterStemmer()
        self.map_trans_terms = collections.defaultdict(lambda: 0)
        self.map_trans_docs = collections.defaultdict(lambda: 0)
        self.total_words = 0

        if stop_words_file != "":
            with open(stop_words_file, 'r') as f:
                for line in f:
                    for w in line.split():
                        w = self.normalize(w)
                        self.stop_words[w] = True

        self.BLOCK_SIZE = 100  # Number of entries; each record is a (term, docID) pair
Example 28
def remove_porterstemmer(input_file, noise_words_set):
    questions = list()
    word_weight = []
    p = PorterStemmer()
    for line in input_file:
        line = line.lower()
        words = filter(None, re.split("\W*\d*", line))
        question = []
        for word in words:
            new_word = p.stem(word, 0, len(word) - 1)
            if new_word not in noise_words_set and len(new_word) > 2:
                question.append(new_word)
        questions.append(question)
        word_weight.append(Counter(question))
    return word_weight, questions
Example 30
def en_preprocess(file_path: str, stop_words: list, step: int = 4) -> str:
    '''
    Step1: Extract pure-text content from the original html file
    Step2: To lower case, remove special characters
    Step3: Remove stop words
    Step4: Porter stemming (Final result)
    '''
    with open(file_path, "r", encoding="UTF-8") as f:
        html_content = f.read()
        parsed_content = BeautifulSoup(html_content, 'html.parser')
        text_content = ""
        # Extract pure-text content from the original html file
        for child in parsed_content.find(id="mw-content-text").div.children:
            if child.name in ("p", "h2", "h3", "h4", "h5"):
                text_content += child.get_text()
        if step == 1:
            return text_content
        # To lower case
        text_content = text_content.lower()
        # Remove special characters
        text_content = text_content.replace("'", "")
        text_content = text_content.replace("-", "")
        for i in range(len(text_content)):
            curr_char = text_content[i]
            if not ((curr_char >= 'a' and curr_char <= 'z')):
                text_content = text_content.replace(curr_char, " ")
        # Remove duplicated spaces
        text_content = re.sub("[ ]+", " ", text_content)
        if step == 2:
            return text_content
        # Tokenize
        token_list = text_content.split(" ")
        # Remove stop words
        new_list = []
        for token in token_list:
            if token not in stop_words and token != "":
                new_list.append(token)
        token_list = new_list
        if step == 3:
            return " ".join(token_list)
        # Porter stemming
        p = PorterStemmer()
        new_list = []
        for i in range(len(token_list)):
            new_list.append(p.stem(token_list[i], 0, len(token_list[i]) - 1))
        token_list = new_list
        final_result = " ".join(token_list)
        return final_result
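A minimal usage sketch for en_preprocess above. The HTML path and stop-word list are hypothetical; the page is assumed to be a MediaWiki-style article (the function looks up the element with id "mw-content-text"), and step=4 runs the full pipeline through Porter stemming:

# Sketch only: the file path and stop-word list are made up for illustration.
stop_words = ["the", "a", "an", "of", "and", "is", "are"]
stemmed_text = en_preprocess("pages/sample_article.html", stop_words, step=4)
print(stemmed_text[:200])  # first 200 characters of the stemmed, space-joined tokens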
Example 31
 def __init__(self, is_turbo=False):
   self.name = 'Moviebot'
   self.is_turbo = is_turbo
   self.is_binarized = True
   self.p = PorterStemmer()
   self.punctuation = set([",", ".","?","!",":",'"',"'","(",")"])
   self.endPunctuation = set([".","?","!",":",'"',"'","(",")"])
   self.negateWords = set(["not", "no", "never", "neither", "nor"])
   self.prevNegateWords = set(["but", "although", "however", "yet"])
   self.extremeWords = set(["very", "really", "extremely"])
   self.movies = {}
   self.movie_to_index_dict = {}
   self.alternate_titles_dict = {}
   self.movie_scores = []
   self.non_binarized_matrix = {}
   self.read_data()
Example 32
    def __init__(self, pathData, pathID, pathQuery, uniqueT, isStemming):
        """

        :param pathData: string of pathData
        :param pathID: string of pathID
        :param pathQuery: string of pathQuery
        :param uniqueT: integer of unique term
        :param isStemming: True if using stemming, False if not using stemming
        """
        self.__buildID(pathID)
        if isStemming:
            self.__stemmer = PorterStemmer()
        self.__buildQuery(pathQuery, isStemming)
        self.__uniqueTerm = uniqueT

        self.__buildData(pathData)
Example 33
def getStemWords(query_line, stopwords):
    raw_data = query_line.replace(".", "").replace(",", "").replace('"', "").replace("\n", "").replace("-", " ") \
        .replace("(", "").replace(")", "").split(" ")

    for i in stopwords:
        while i in raw_data:
            raw_data.remove(i)

    stemmedArray = list(raw_data)  # copy so the unstemmed terms in raw_data are preserved
    p = PorterStemmer()

    for i in range(1, len(stemmedArray)):
        while stemmedArray[i] != p.stem(stemmedArray[i], 0, len(stemmedArray[i]) - 1):
            stemmedArray[i] = p.stem(stemmedArray[i], 0, len(stemmedArray[i]) - 1)

    return raw_data[0], raw_data[1:], stemmedArray[1:]
Example 34
class Parser:

	#A processor for removing the commoner morphological and inflexional endings from words in English
	stemmer=None

	stopwords=[]

	def __init__(self,):
		self.stemmer = PorterStemmer()

		#English stopwords from ftp://ftp.cs.cornell.edu/pub/smart/english.stop
		#self.stopwords = open('data/english.stop', 'r').read().split()


	def clean(self, string):
		""" remove any nasty grammar tokens from string """
		string = string.replace(".","")
		string = string.replace("\s+"," ")
		string = string.lower()
		return string
	

	def removeStopWords(self,list):
		""" Remove common words which have no search value """
		return [word for word in list if word not in self.stopwords ]


	def tokenise(self, string):
		""" break string up into tokens and stem words """
		string = self.clean(string)
		words = string.split(" ")
		
		return [self.stemmer.stem(word,0,len(word)-1) for word in words]
Example 35
def tokenize(documents):
    # Read the stopwords
    stop_word_set = set(open('./stopwords.txt', 'r').read().split())
    # Initialize the Porter stemmer
    p = PorterStemmer()
    # Create a dictionary where each element is also a dictionary. The outer dictionary will map stemmed words to
    # document ids and the inner dictionaries will map the document ids to their indices in the document.
    word_to_doc = defaultdict(lambda: defaultdict(list))  # Positional inverted index
    for document_index, document in enumerate(documents, start=1):
        for word_index, word in enumerate(document.split()):
            if word not in stop_word_set:
                # Store each word as stemmed and put them to the inverted index
                stemmed_word = p.stem(word, 0, len(word) - 1)
                # stemmed_word = word
                word_to_doc[stemmed_word][document_index].append(word_index)
    return word_to_doc
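A short usage sketch for tokenize above. The two documents are made up, ./stopwords.txt must exist as in the function, and positions are word offsets within each document:

# Sketch only: the documents are illustrative.
docs = ["cats are running fast", "the cat sat still"]
index = tokenize(docs)
# index["cat"] maps document ids to the positions where "cats"/"cat" occurred,
# e.g. {1: [0], 2: [1]}, assuming neither word is in the stop-word file.
print(dict(index["cat"]))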
Example 36
    def __init__(self, is_turbo=False):
      self.name = 'moviebot'
      self.is_turbo = is_turbo
      #Initialize relevant classes
      self.stemmer = PorterStemmer()
      self.sentiment = {}
      self.read_data()
      #User data
      self.response_indexes = {}
      #Read in data
      self.responses = self.readInFile('deps/responses.txt', False)
      self.articles = ['the', 'a', 'an']
      self.negations = self.readInFile('deps/negations.txt', True)
      self.punctuation = '.,?!-;'
      self.no_words = self.readInFile('deps/no_words.txt', True)
      self.yes_words = self.readInFile('deps/yes_words.txt', True)
      self.findpatterns = [
      #patterns for finding movies without quotes
      '\"(.*?)\"',
      'movie.*?(?:was|is|start(?:ed|s)|end(?:ed|s)) (.*)',
      '(?:I (?:think|thought|feel|felt) | watching )?(.*?) (?:was|is|start(?:ed|s)|end(?:ed|s)) .*?',
      'I .*? watching (.*)',
      'I .*?(?:watch|enjoy|hat|(?:dis)?lik|lov)ed (.*)'
       ]

      #Read in fine-sentiment data
      self.intensifiers = self.readInFile('deps/intensifiers.txt', True)
      self.strong_negative = self.readInFile('deps/strong_negative.txt', True)
      self.strong_negative = [self.stemmer.stem(word) for word in self.strong_negative]
      self.strong_positive = self.readInFile('deps/strong_positive.txt', True)
      self.strong_positive = [self.stemmer.stem(word) for word in self.strong_positive]

      #Binarize ratings matrix
      self.binarize()
      self.justGaveRec = False
      #Initialize relevant vars
      self.recommendations = []
      self.INFO_THRESHOLD = 5
      #Pre-process titles, ratings to make later work more efficient.
      self.titles_map = self.processTitles(self.titles)
      ## Remember which movies were mentioned without an explicit sentiment
      self.mentioned_movies = []
      self.justFollowedUp = False
      self.checkingDisamb = False

      self.prevEmotion = 0
      self.emotionWords = self.readInEmotions()
Example 37
def search_dic(text, SearDic, original_word, index):
    text = remove_tag(text)
    word = text.split()
    p = PorterStemmer()
    for i in word:

        # poter_i = i
        poter_i = p.stem(i, 0, len(i) - 1)  # porter
        if poter_i not in SearDic.keys():
            SearDic[poter_i] = [index]
            original_word[poter_i] = [i]
        else:
            if index not in SearDic[poter_i]:
                SearDic[poter_i].append(index)
                if i not in original_word[poter_i]:
                    original_word[poter_i].append(i)
    return SearDic, original_word
Example 38
 def __init__(self):
     # For holding the data - initialized in read_data()
     self.titles = []
     self.docs = []
     self.vocab = []
     # For the text pre-processing.
     self.alphanum = re.compile('[^a-zA-Z0-9]')
     self.p = PorterStemmer()
Example 39
def dict_qryid_terms(is_stopping):
  global STOPWORDS_FILE 
  stopwords_list = stopwords(STOPWORDS_FILE)  ## create stopwords list
  p = PorterStemmer() ##create an Porter Stemmer instance 
  dictquery = defaultdict(lambda: [])  ## create the target dictionary
  with open(QUERY_TEXT_FILE, 'r') as f: 
    for line in f: 
      data_list = re.findall(r"[\w]+", line)
      query_id = data_list[0]
      for term in data_list[1:]:
        term = term.lower()
        if is_stopping:
          if term not in stopwords_list:
            dictquery[query_id].append(p.stem(term, 0,len(term)-1))
        else: 
            dictquery[query_id].append(p.stem(term, 0,len(term)-1))
  return dictquery
Example 41
def stem_string(line):
    if line == "":
        return ""
    p = PorterStemmer()
    word = ""
    output = ""
    for c in line:
        if c.isalpha():
            word += c.lower()
        else:
            if word:
                output += p.stem(word, 0,len(word)-1)
                word = ''
            output += c.lower()
    if word:
        output += p.stem(word, 0,len(word)-1)
    return output
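A quick usage sketch for stem_string; the output in the comment is what the classic Porter algorithm produces, with non-alphabetic characters passed through lower-cased:

# Sketch only.
print(stem_string("Stemming words, running quickly!"))
# -> "stem word, run quickli!"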
Example 42
def getQuestionKeywords(question):
    """Return the keywords from a question.

    The logic is: remove the stop words and punctuations from question, stem the keywords and remove duplicates
    Currently there are still issues:
    1. the stop-word list is not complete: e.g. "recommend" is not treated as a stop word.
    2. stemmer issue: the current stemmer utility has a bug, e.g. "restaurant" is stemmed to "restau".

    >>> getQuestionKeywords('what is the best preschool in Potomac?')
    ['potomac', 'preschool']

    >>> getQuestionKeywords('Can someone help with a preschool around potomac?')
    ['potomac', 'preschool']

    >>> getQuestionKeywords('What is the best cafeteria around potomac?')
    ['potomac', 'restaurant']

    """

    # split the question into a list
    keywordList = question.split()

    # strip the punctuations etc
    keywordList = [keyword.strip(PUNCTUATION) for keyword in keywordList]

    # convert into lower case
    keywordList = [keyword.lower() for keyword in keywordList]

    #remove stop words from keywords
    keywordList = [keyword for keyword in keywordList if keyword not in stopWords]

    #stem the keywords
    stemmer = PorterStemmer()
    keywordList = [stemmer.stem(keyword,0,len(keyword)-1) for keyword in keywordList]

    #take care of synonyms
    keywordList = [synonyms[keyword] if keyword in synonyms else keyword for keyword in keywordList ]

    #remove duplicates
    keywordList = list(set(keywordList))

    #sort the keywords
    keywordList.sort()
    
    return keywordList
Example 43
 def parse(self):
     #remove stop words
     self.dataList = [w for w in self.dataList if not w in self.stopWords]
     #get the stem of the words
     st = PorterStemmer()
     self.dataList = [st.stem(w, 0, len(w)-1) for w in self.dataList]        
     # add to list based on frequency of occurrence
     wordFreq = {}
     for word in self.dataList:
         if word in wordFreq:
             wordFreq[word] = wordFreq[word] + 1
         else:
              wordFreq[word] = 1  # first occurrence counts as one
     wordList = sorted(wordFreq.iteritems(), key = operator.itemgetter(1))
     newList = []
     for w in wordList:
         newList.insert(0,w[0])
     self.dataList = newList
Example 44
    def __init__(self, path, num_records):
        self.porter = PorterStemmer()
        self.stop = set()
        with open("stop.words.dat", "r") as sw:
            for line in sw:
                self.stop.add(line[:-1])

        if path != "" and num_records != 0:
            self.process(path, num_records)
Example 45
    def __init__(self):
        self.stoplist = open('stopword_list.txt', 'r').read().split()
        self.porter = PorterStemmer()
        doc2id = pickle.load(open('doc2id.pkl', 'rb'))
        self.id2doc = {v:k for k, v in doc2id.items()}
        
        self.index = pickle.load(open('index.pkl', 'rb'))
        self.pos_index = pickle.load(open('pos_index.pkl', 'rb'))

        self.idf_new_term = log(len(doc2id)/0.5, 2)
Example 46
def getTopTerms(currentQuery, weightsMap, topX):

    p = PorterStemmer()
    current_terms = []
    for term in currentQuery.split():
        term = p.stem(term.lower(), 0,len(term)-1)
        current_terms.append(term)    

    i = 0
    new_terms = []
    for term in sorted(weightsMap, key=weightsMap.get, reverse=True):
        if term in constants.QUERY_SKIP_TERMS or p.stem(term.lower(), 0,len(term)-1) in current_terms:
            continue
        new_terms.append(term)
        current_terms.append(p.stem(term.lower(), 0,len(term)-1))
        i = i + 1
        if (topX != 'ALL' and i >= topX):
            break
    return new_terms
Example 47
	def classify(self, query):
		if self.isSuicide(query):
			return [('suicidal ideation', 1), ('depression', .5), ('emotional disturbance', .5)]

		query = "".join(c for c in query if c not in ('!','.',':',',',';','?')).lower()
		query_words = query.split() 
		p = PorterStemmer()
		query_words = [p.stem(query_words[i]) for i in range(len(query_words))]
		q = np.zeros(len(self.word_to_index))
		for word in query_words:
			if word in self.word_to_index:
				q[self.word_to_index[word]] += self.idf[self.word_to_index[word]]

		membership_scores = []
		for i in range(len(self.tfidf_matrix)):
			#compute cosine similarity
			docvec = self.tfidf_matrix[i]
			cossim = (np.inner(docvec, q)/(np.linalg.norm(docvec)*np.linalg.norm(q))).item(0,0)
			membership_scores.append(cossim)
		return sorted(zip(self.categories, membership_scores), key=lambda x: x[1], reverse=True)
Example 48
    def __init__(self):
        self.dname2id = pickle.load(open('doc2id.pkl', 'rb'))
        try:
            f = open('stopword_list.txt', 'r')
        except IOError:
            raise IOError('Failed to open stopword_list.txt.')

        self.stoplist = f.read().split()
        self.porter = PorterStemmer()
        ## term to its posting list.
        self.index = {}
        self.pos_index = defaultdict(list)
        self.doc_num = len(self.dname2id)
Example 49
def dicts_docid_words_docid_doclen():
  global STOPWORDS_FILE 
  p = PorterStemmer() 
  stopwords_list = stopwords(STOPWORDS_FILE)
  docid_words_dict = defaultdict(lambda: [])
  docid_doclen_dict = {}
  path = CACM_PATH
  """extract all the file names in the path and put them into a list"""
  dirs_list = os.listdir(path)
  for docname in dirs_list:
    docno = ''.join([s for s in docname if s.isdigit()])
    f = urllib.urlopen(path+docname).read()
    data = re.compile(r'.*?<pre>(.*?)([0-9]+\t[0-9]+\t[0-9]+)', re.DOTALL).match(f).group(1)
    data = re.findall(r"[\w]+", data)
    for word in data:
      word = word.lower()  
      if word not in stopwords_list:  
        word_stemmed = p.stem(word, 0,len(word)-1)
        docid_words_dict[docno].append(word_stemmed)
    """doclen is the length of doc after stopping and stemming"""
    docid_doclen_dict[docno]=len(data)  
  return docid_words_dict,docid_doclen_dict
Example 50
class Tokenizer:
    """ Helper class for tokenizing document space and removing stop words """

    corpus = None
    terms = []
    stop_words = []
    stemmer = None

    def __init__(self):

        # read stop words from file
        self.stop_words = open('stop_words.txt', 'r').read().split()
        self.stemmer = PorterStemmer()

    def tokenize(self, docs_string):
        """ Tokenizer's most important method.
        It separates the whole corpus string into tokens and
        removes stop words.
        """
        self.corpus = docs_string

        self.clean()

        self.terms = self.corpus.split(" ")

        self.remove_stop_words()

        self.remove_duplicates()

        return self.terms

    def clean(self):
        """ get rid of punctuation signs, convert to lower case, standardize spacing """
        self.corpus = self.corpus.replace(".", " ")
        self.corpus = self.corpus.replace(",", " ")
        self.corpus = self.corpus.lower()
        self.corpus = self.corpus.replace("\s+", " ")

    def remove_stop_words(self):
        self.terms = [self.stemmer.stem(term,0,len(term)-1) for term in self.terms if term not in self.stop_words]

    def remove_duplicates(self):
        """ remove duplicated terms in the list """
        # the deprecated sets.Set module is not needed; the built-in set suffices
        self.terms = set(self.terms)
Example 51
class Processor:
    def __init__(self, path, num_records):
        self.porter = PorterStemmer()
        self.stop = set()
        with open("stop.words.dat", "r") as sw:
            for line in sw:
                self.stop.add(line[:-1])

        if path != "" and num_records != 0:
            self.process(path, num_records)

    def process(self, path, num_records):
        with open(path, "r", encoding="utf-8") as src:
            with open("sample.txt", "w") as dst:
                num_total = 0
                for line in src:
                    AnonID, Query, QueryTime = line.split("\t")[:3]

                    if AnonID == "AnonID":
                        continue

                    if num_total < num_records:
                        tidy = self.trim(Query)
                        if tidy != "":
                            Query = self.remove_stop_words(tidy)
                            Query = self.porter_stemming(Query)
                            if Query != "":
                                dst.write("{}\t{}\t{}\n".format(AnonID, Query, QueryTime))
                                num_total += 1

    def trim(self, string):
        return re.sub(r"\W", " ", string)

    def remove_stop_words(self, string):
        words = string.split()
        return " ".join([w for w in words if w not in self.stop])

    def porter_stemming(self, string):
        result = [self.porter.stem(word, 0, len(word) - 1) for word in string.split()]
        return " ".join(result)
Example 52
class Indexer(object):

    def __init__(self):
        self.dname2id = pickle.load(open('doc2id.pkl', 'rb'))
        try:
            f = open('stopword_list.txt', 'r')
        except IOError:
            raise IOError('Failed to open stopword_list.txt.')

        self.stoplist = f.read().split()
        self.porter = PorterStemmer()
        ## term to its posting list.
        self.index = {}
        self.pos_index = defaultdict(list)
        self.doc_num = len(self.dname2id)

    def terms_for_keywords_query(self, terms):
        ## Filter out stop words.
        return [t for t in terms if t not in self.stoplist]

    def get_terms(self, contents):
        terms = contents.split()
        terms = map(del_punc, terms)
        terms = map(lambda s : s.lower(), terms)

        ## Terms for keywords based query(aka: free text query).
        terms_for_kq = [self.porter.stem(term, 0, len(term)-1) for term in self.terms_for_keywords_query(terms)]

        ## Terms for phrase query.
        terms_for_pq = [self.porter.stem(term, 0, len(term)-1) for term in terms]

        return terms_for_kq, terms_for_pq

    def get_doc_id(self, dname):
        return self.dname2id[dname]

    def build_posting_list_for_pq(self, terms, doc_id):
        """
        Build posting list(term : [doc, [positions]]) for phrase query.
        """
        term2doc_pos = {}
        for pos, term in enumerate(terms):
            try:
                term2doc_pos[term][1].append(pos)
            except:
                term2doc_pos[term] = [doc_id, [pos]]

        for term, posting in term2doc_pos.iteritems():
            self.pos_index[term].append(posting)

    def build_posting_list_for_kq(self, terms, doc_id):
        """
        Build posting list(term : [idf, [(doc1, tf), (doc2, tf), ...]]) for keywords based query.
        """
        tf_counter = Counter(terms)
        max_elem = tf_counter.most_common(1)
        most_common_term = max_elem[0][0]
        max_tf = max_elem[0][1]
        # print 'Most common term is:', most_common_term, '\tMax tf is:', max_tf

        for term, tf in tf_counter.iteritems():
            if not self.index.has_key(term):
                df = 1
                self.index[term] = [df, [(doc_id, float(tf)/max_tf)]]
            else:
                df = self.index[term][0]
                df += 1
                self.index[term][0] = df
                self.index[term][1].append((doc_id, float(tf)/max_tf))

    def write_index_to_file(self):
        pickle.dump(self.index, open('index.pkl', 'wb'))
        pickle.dump(self.pos_index, open('pos_index.pkl', 'wb'))

    def compute_idf(self):
        for term, postings in self.index.iteritems():
            postings[0] = log(float(self.doc_num)/postings[0], 2)

    def parse_collection(self):

        stdout_old = sys.stdout
        sys.stdout = open('indexer_log', 'w')
        print 'Total %d documents need to be processed.' % self.doc_num

        for index, (doc_name, doc_id) in enumerate(sorted(self.dname2id.iteritems(), key=itemgetter(1))):
            try:
                print 'Building index for:', os.path.basename(doc_name),
                print '\tDocument ID:', doc_id
                f = open(doc_name, 'r')
            except IOError:
                raise IOError('Unable to open document [%s]' % doc_name)

            ## Get terms for keywords based query and phrase based query.
            terms_for_kq, terms_for_pq = self.get_terms(f.read())
            
            self.build_posting_list_for_kq(terms_for_kq, doc_id)
            self.build_posting_list_for_pq(terms_for_pq, doc_id)

        self.compute_idf()
        self.write_index_to_file()

        sys.stdout = stdout_old
Example 53
    def __init__(self):

        # read stop words from file
        self.stop_words = open('stop_words.txt', 'r').read().split()
        self.stemmer = PorterStemmer()
Example 54
class IRSystem:

    def __init__(self):
        # For holding the data - initialized in read_data()
        self.titles = []
        self.docs = []
        self.vocab = []
        # For the text pre-processing.
        self.alphanum = re.compile('[^a-zA-Z0-9]')
        self.p = PorterStemmer()


    def get_uniq_words(self):
        uniq = set()
        for doc in self.docs:
            for word in doc:
                uniq.add(word)
        return uniq


    def __read_raw_data(self, dirname):
        print "Stemming Documents..."

        titles = []
        docs = []
        os.mkdir('%s/stemmed' % dirname)
        title_pattern = re.compile('(.*) \d+\.txt')

        # make sure we're only getting the files we actually want
        filenames = []
        for filename in os.listdir('%s/raw' % dirname):
            if filename.endswith(".txt") and not filename.startswith("."):
                filenames.append(filename)

        for i, filename in enumerate(filenames):
            title = title_pattern.search(filename).group(1)
            print "    Doc %d of %d: %s" % (i+1, len(filenames), title)
            titles.append(title)
            contents = []
            f = open('%s/raw/%s' % (dirname, filename), 'r')
            of = open('%s/stemmed/%s.txt' % (dirname, title), 'w')
            for line in f:
                # make sure everything is lower case
                line = line.lower()
                # split on whitespace
                line = [xx.strip() for xx in line.split()]
                # remove non alphanumeric characters
                line = [self.alphanum.sub('', xx) for xx in line]
                # remove any words that are now empty
                line = [xx for xx in line if xx != '']
                # stem words
                line = [self.p.stem(xx) for xx in line]
                # add to the document's contents
                contents.extend(line)
                if len(line) > 0:
                    of.write(" ".join(line))
                    of.write('\n')
            f.close()
            of.close()
            docs.append(contents)
        return titles, docs


    def __read_stemmed_data(self, dirname):
        print "Already stemmed!"
        titles = []
        docs = []

        # make sure we're only getting the files we actually want
        filenames = []
        for filename in os.listdir('%s/stemmed' % dirname):
            if filename.endswith(".txt") and not filename.startswith("."):
                filenames.append(filename)

        if len(filenames) != 60:
            msg = "There are not 60 documents in ../data/RiderHaggard/stemmed/\n"
            msg += "Remove ../data/RiderHaggard/stemmed/ directory and re-run."
            raise Exception(msg)

        for i, filename in enumerate(filenames):
            title = filename.split('.')[0]
            titles.append(title)
            contents = []
            f = open('%s/stemmed/%s' % (dirname, filename), 'r')
            for line in f:
                # split on whitespace
                line = [xx.strip() for xx in line.split()]
                # add to the document's contents
                contents.extend(line)
            f.close()
            docs.append(contents)

        return titles, docs


    def read_data(self, dirname):
        """
        Given the location of the 'data' directory, reads in the documents to
        be indexed.
        """
        # NOTE: We cache stemmed documents for speed
        #       (i.e. write to files in new 'stemmed/' dir).

        print "Reading in documents..."
        # dict mapping file names to list of "words" (tokens)
        filenames = os.listdir(dirname)
        subdirs = os.listdir(dirname)
        if 'stemmed' in subdirs:
            titles, docs = self.__read_stemmed_data(dirname)
        else:
            titles, docs = self.__read_raw_data(dirname)

        # Sort documents alphabetically by title to ensure we have the proper
        # document indices when referring to them.
        ordering = [idx for idx, title in sorted(enumerate(titles),
            key = lambda xx : xx[1])]

        self.titles = []
        self.docs = []
        numdocs = len(docs)
        for d in range(numdocs):
            self.titles.append(titles[ordering[d]])
            self.docs.append(docs[ordering[d]])

        # Get the vocabulary.
        self.vocab = [xx for xx in self.get_uniq_words()]


    def compute_tfidf(self):
        # -------------------------------------------------------------------
        # TODO: Compute and store TF-IDF values for words and documents.
        #       Recall that you can make use of:
        #         * self.vocab: a list of all distinct (stemmed) words
        #         * self.docs: a list of lists, where the i-th document is
        #                   self.docs[i] => ['word1', 'word2', ..., 'wordN']
        #       NOTE that you probably do *not* want to store a value for every
        #       word-document pair, but rather just for those pairs where a
        #       word actually occurs in the document.

        print "Calculating tf-idf..."
        self.tfidf = {}
        
        # initialized
        for word in self.vocab:
            for d in range(len(self.docs)):
                if word not in self.tfidf:
                    self.tfidf[word] = {}
                self.tfidf[word][d] = 0.0
        
        N = len(self.docs)
        for word in self.vocab:
            indices = self.inv_index[word]
            for i in indices:                
                tf = 1 + math.log10(indices[i])
                idf = math.log10(N*1.0 / len(self.get_posting(word)))
                self.tfidf[word][i] = tf * idf
        
        #print self.tfidf
        # ------------------------------------------------------------------


    def get_tfidf(self, word, document):
        # ------------------------------------------------------------------
        # TODO: Return the tf-idf weighting for the given word (string) and
        #       document index.
        tfidf = 0.0
        
        if word in self.tfidf:
            tfidf = self.tfidf[word][document]
        
        # ------------------------------------------------------------------
        return tfidf


    def get_tfidf_unstemmed(self, word, document):
        """
        This function gets the TF-IDF of an *unstemmed* word in a document.
        Stems the word and then calls get_tfidf. You should *not* need to
        change this interface, but it is necessary for submission.
        """
        word = self.p.stem(word)
        return self.get_tfidf(word, document)


    def index(self):
        """
        Build an index of the documents.
        """
        print "Indexing..."
        # ------------------------------------------------------------------
        # TODO: Create an inverted index.
        #       Granted this may not be a linked list as in a proper
        #       implementation.
        #       Some helpful instance variables:
        #         * self.docs = List of documents
        #         * self.titles = List of titles

        # Example: inv_index['separ'] = {54: 3}  in doc id 54, occurs 3 times!
        
        inv_index = {}
                        
        for word in self.vocab:
            inv_index[word] = {}

        numdocs = len(self.docs)
        
        for d in xrange(0, numdocs):
            doc = self.docs[d]
            for word in doc:
                #if word == "zulu":
                #    print "zulu", inv_index[word]
                    
                if d in inv_index[word]:
                    inv_index[word][d] = inv_index[word][d]+1
                else:                    
                    inv_index[word][d] = 1


        #print inv_index['separ']
        #print "zulu inverted index", inv_index['zulu']
        #print inv_index
        self.inv_index = inv_index

        # ------------------------------------------------------------------


    def get_posting(self, word):
        """
        Given a word, this returns the list of document indices (sorted) in
        which the word occurs.
        """
        # ------------------------------------------------------------------
        # TODO: return the list of postings for a word.
        posting = []
        
        for i in self.inv_index[word]:
            posting.append(i)
            
        posting.sort()
        
        #if word == "zulu":
        #    print "posting for word", word , posting
        
        return posting
        # ------------------------------------------------------------------


    def get_posting_unstemmed(self, word):
        """
        Given a word, this *stems* the word and then calls get_posting on the
        stemmed word to get its postings list. You should *not* need to change
        this function. It is needed for submission.
        """
        word = self.p.stem(word)
        return self.get_posting(word)


    def boolean_retrieve(self, query):
        """
        Given a query in the form of a list of *stemmed* words, this returns
        the list of documents in which *all* of those words occur (ie an AND
        query).
        Return an empty list if the query does not return any documents.
        """
        # ------------------------------------------------------------------
        # TODO: Implement Boolean retrieval. You will want to use your
        #       inverted index that you created in index().
        # Right now this just returns all the possible documents!
        qsets = {}
        for qword in query:
            qsets[qword] = set()
            
            if qword in self.inv_index:
                for i in self.inv_index[qword]:
                    qsets[qword].add(i)
                    
        #for qword in qsets:
        #    print "word", qword, "set",  qsets[qword] 
            
        # initial set
        final = qsets[query[0]]
        for x in range(1, len(query)):
            final = final.intersection(qsets[query[x]])
        
        #print "final set ",  final
        
        docs = list(final)

        # ------------------------------------------------------------------

        return sorted(docs)   # sorted doesn't actually matter


    def rank_retrieve(self, query):
        """
        Given a query (a list of words), return a rank-ordered list of
        documents (by ID) and score for the query.
        """
        scores = [0.0 for xx in range(len(self.docs))]
        # ------------------------------------------------------------------
        # TODO: Implement cosine similarity between a document and a list of
        #       query words.

        # Right now, this code simply gets the score by taking the Jaccard
        # similarity between the query and every document.
        
        tf = {}        
        
        words_in_query = set()
        for word in query:
            words_in_query.add(word)
            
            if word not in tf:
                tf[word] = 1
            else:
                tf[word] = tf[word]+1
        
        
        #print query, tf

        for d, doc in enumerate(self.docs):
            words_in_doc = set(doc)
            #scores[d] = len(words_in_query.intersection(words_in_doc)) \
            #        / float(len(words_in_query.union(words_in_doc)))
                    
            union = words_in_query.union(words_in_doc)
            #inter = words_in_query.intersection(words_in_doc)
            
#            ltclnn = {}
#            
#            for w in union:
#                ltclnn[w] = {}
#                ltclnn[w]["dn"] = 0
#                ltclnn[w]["qn"] = 0
#                if w in tf:
#                    ltclnn[w]["qwt"] = 1+ math.log10(tf[w])
#                    ltclnn[w]["qn"] = ltclnn[w]["qn"] + ltclnn[w]["qwt"]**2
#                else:
#                    ltclnn[w]["qwt"] = 0
#                    ltclnn[w]["qn"] = 0
#                    
#                ltclnn[w]["dwt"] = self.get_tfidf(w, d)
#                ltclnn[w]["dn"] = ltclnn[w]["dn"] + ltclnn[w]["dwt"]**2
#                
#            for w in ltclnn:
#                ltclnn[w]["qwtn"] = ltclnn[w]["qwt"] / math.sqrt(ltclnn[w]["qn"])
#                ltclnn[w]["dwtn"] = ltclnn[w]["dwt"] / math.sqrt(ltclnn[w]["dn"])
#            
#            prod = 0
#            for w in ltclnn:
#                prod = prod + ltclnn[w]["qwtn"] * ltclnn[w]["dwtn"]
#            
#            scores[d] = prod            
            
            ltc_sum = 0
            #lnn_sum = 0
            ltc_lnn = 0
            for term in union:                
                                
                ltc = self.get_tfidf(term, d)
                
                ltc_sum = ltc_sum + ltc*ltc
                
                if term in tf:
                    lnn = 1 + math.log10(tf[term])
                else:
                    lnn = 0
                    
                #lnn_sum = lnn_sum + lnn*lnn
                ltc_lnn = ltc_lnn + ltc*lnn
            
            scores[d] = ltc_lnn / math.sqrt(ltc_sum)
                
        
        #print scores

        # ------------------------------------------------------------------

        ranking = [idx for idx, sim in sorted(enumerate(scores),
            key = lambda xx : xx[1], reverse = True)]
        results = []
        for i in range(10):
            results.append((ranking[i], scores[ranking[i]]))
        return results


    def process_query(self, query_str):
        """
        Given a query string, process it and return the list of lowercase,
        alphanumeric, stemmed words in the string.
        """
        # make sure everything is lower case
        query = query_str.lower()
        # split on whitespace
        query = query.split()
        # remove non alphanumeric characters
        query = [self.alphanum.sub('', xx) for xx in query]
        # stem words
        query = [self.p.stem(xx) for xx in query]
        return query


    def query_retrieve(self, query_str):
        """
        Given a string, process and then return the list of matching documents
        found by boolean_retrieve().
        """
        query = self.process_query(query_str)
        return self.boolean_retrieve(query)


    def query_rank(self, query_str):
        """
        Given a string, process and then return the list of the top matching
        documents, rank-ordered.
        """
        query = self.process_query(query_str)
        return self.rank_retrieve(query)
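
A minimal, self-contained sketch of the ltc/lnn cosine weighting used in rank_retrieve above, runnable outside the class; the toy corpus and the cosine_scores name are illustrative assumptions, not part of the original code:

import math
from collections import Counter

def cosine_scores(query_terms, docs):
    """Score each document against the query: log-tf * idf (ltc) weights on
    the document side, log-tf (lnn) weights on the query side, normalized by
    the document vector length only."""
    N = len(docs)
    df = Counter()                      # document frequency of each term
    for doc in docs:
        df.update(set(doc))

    q_tf = Counter(query_terms)
    scores = []
    for doc in docs:
        d_tf = Counter(doc)
        dot = 0.0
        d_norm = 0.0
        for term in set(doc) | set(query_terms):
            d_wt = 0.0
            if d_tf[term] > 0:
                d_wt = (1 + math.log10(d_tf[term])) * math.log10(N / df[term])
            d_norm += d_wt * d_wt
            q_wt = 1 + math.log10(q_tf[term]) if q_tf[term] > 0 else 0.0
            dot += d_wt * q_wt
        scores.append(dot / math.sqrt(d_norm) if d_norm > 0 else 0.0)
    return scores

docs = [["new", "job", "offer"], ["beer", "and", "wine"], ["job", "interview", "new"]]
print(cosine_scores(["new", "job"], docs))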
Esempio n. 55
0
	def __init__(self):
		self.stemmer = PorterStemmer()
Esempio n. 56
0
class TextIndex:
    def __init__(self):
        self.index = defaultdict(list)
        self.p = PorterStemmer()
        
    '''get stop words from stopwords file'''
    def getStopWords(self, stopwordsFile):
        f = open(stopwordsFile, 'r')
        stopwords = [line.rstrip() for line in f]
        self.sw = dict.fromkeys(stopwords)
        f.close()

    '''Create an inverted index to store word-document pairs'''        
    def create(self, docList, dirPath, stopwordsFile):

        self.getStopWords(dirPath + stopwordsFile)
        
        for d in docList:
            file = open(dirPath + d)
            pos = 1
            docIndex={}
            for word in file.read().split():
                '''Lowercase and strip leading/trailing periods'''
                key = word.lower().strip(".")
                if key not in self.sw:
                    '''Use the Porter Stemmer algorithm to stem words.'''
                    key = self.p.stem(key, 0, len(key) - 1)
                    try:
                        docIndex[key][1].append(pos)
                    except KeyError:
                        docIndex[key] = [d, array('I', [pos])]
                pos += 1

            '''Merge the per-document index into the global index'''
            for term, posting in docIndex.items():
                self.index[term].append(posting)
        print(self.index)

    '''Get the query type''' 
    def getQueryType(self, query):
        if '"' in query:
            return 'PQ' 
        elif (len(query.split()) > 1):
            return 'FTQ' 
        else:
            return 'OWQ'
        
    '''Query the Index created above'''
    def queryIndex(self):
        while True:
            q = sys.stdin.readline()
            q = q.rstrip()
            if q == '':
                break

            queryType = self.getQueryType(q)
            if queryType == 'OWQ':
                self.oneWordQuery(q)
            elif queryType == 'FTQ':
                self.freeTextQuery(q)
            elif queryType == 'PQ':
                self.phraseQuery(q)

    '''One Word Query'''
    def oneWordQuery(self, q):
        originalQuery = q
        q = self.p.stem(q, 0, len(q) - 1)

        if len(q) == 0:
            print('Length of q is zero')
            return

        q = "'{}'".format(q)

        print(q)

        '''Query contains only one word'''
        if q not in self.index.keys():
            print('q is not in index')
            return
        else:
            pos = self.index[q]
            pos = [x[0] for x in pos]
            pos = ' '.join(pos)
            print(pos)

    '''Extract words from the free text query '''
    def getTerms(self, line):
        line = line.lower()
        '''replace non alphanumeric characters with space'''
        line = re.sub(r'[^a-z0-9 ]',' ',line)

        line = line.split()
        line = [x for x in line if x not in self.sw]
        line = [self.p.stem(word, 0, len(word) -1) for word in line]
        return line

    '''This function returns the intersection of lists'''
    def intersectsLists(self, lists):
        if len(lists) == 0:
            return []

        '''Sort the list on the basis of length such that smallest item appears first'''
        lists.sort(key=len)
        return list(reduce(lambda x, y: set(x) & set(y), lists))

    def getPostings(self, terms):
        '''all terms in the list are guaranteed to be in the index'''
        return [self.index[term] for term in terms]

    def getDocsFromPostings(self, postings):
        '''no empty list in postings'''
        return [[x[0] for x in p] for p in postings]

    '''Free Text Query'''
    def freeTextQuery(self, q):
        q = self.getTerms(q)

        if len(q)==0:
            print('')
            return

        li = set()
        for term in q:
            # skip terms missing from the index (self.index is a defaultdict,
            # so a plain lookup would silently add an empty entry)
            if term in self.index:
                p = [x[0] for x in self.index[term]]
                li = li | set(p)

        li = list(li)
        li.sort()
        print(' '.join(li))

    '''Phrase Query'''
    def phraseQuery(self, q):
        originalQuery=q
        q = self.getTerms(q)
        if len(q) == 0:
            print('')
            return
        elif len(q) == 1:
            self.oneWordQuery(originalQuery)
            return

        phraseDocs = self.phraseQueryDocs(q)

        print(' '.join(map(str, phraseDocs)))



    def phraseQueryDocs(self, termList):
        phraseDocs = []
        length = len(termList)

        '''first find matching docs'''
        for term in termList:
            if term not in self.index:
                '''if a term doesn't appear in the index there can't be any document matching it'''
                return []

        postings = self.getPostings(termList)
        docs = self.getDocsFromPostings(postings)

        '''docs are the documents that contain every term in the query'''
        docs = self.intersectsLists(docs)
        '''postings are the postings list of the terms in the documents docs only'''
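
The example above is cut off before the positional check. Purely as an illustration (not the original author's continuation), a minimal standalone sketch of verifying that query terms occur at consecutive positions, assuming postings of the form {term: {doc_id: [positions]}}; all names here are hypothetical:

def phrase_match(postings, terms):
    '''Return the ids of documents in which the terms appear consecutively.'''
    if not terms or any(t not in postings for t in terms):
        return []
    # documents containing every term
    common = set(postings[terms[0]])
    for t in terms[1:]:
        common &= set(postings[t])
    matches = []
    for doc in common:
        for p in postings[terms[0]][doc]:
            # the i-th term of the phrase must sit at offset i from p
            if all(p + i in postings[t][doc] for i, t in enumerate(terms)):
                matches.append(doc)
                break
    return sorted(matches)

postings = {
    "new": {"d1": [3, 9], "d2": [1]},
    "job": {"d1": [4], "d2": [5]},
}
print(phrase_match(postings, ["new", "job"]))   # ['d1']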
Esempio n. 57
0
 def __init__(self):
     self.index = defaultdict(list)
     self.p = PorterStemmer()
Esempio n. 58
0
def stem_words(l):
    ps = PorterStemmer()
    return [ps.stem(x, 0, len(x) - 1) for x in l]
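
A quick usage check for the helper above (the sample tokens are arbitrary, and the exact stems depend on the PorterStemmer variant in use):

print(stem_words(["running", "flies", "happily", "stemming"]))
# typically ['run', 'fli', 'happili', 'stem'] with a classic Porter stemmer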
Esempio n. 59
0
from PorterStemmer import PorterStemmer
from pprint import pprint
from functools import reduce   # needed for reduce() on Python 3
import math

corpus = ["At work", "New job", "Enjoying", "Beer", "Days off", "wedding", "Office", "Drinks", "Wine", "Drinks", "Blessed", "A drink", "Hubby", "Much needed", "New place", "Thankful", "apartment", "Excited about", "Vacation", "Celebrate", "Let me know", "Had a blast", "laundry", "care of", "company", "Grocery", "Wishes", "Drinking for eveveryone", "After work", "To work tommorow", "Bills", "taxes", "Husband", "shift", "The bar", "Potty", "ready to", "Celebrating", "To enjoy", "My babies", "Errands", "Relaxing", "apt", "Fingers crossed", "Poor baby", "Day to all", "women", "Work", "Yard", "Doesn't", "Uni", "Days", "Volunteer", "Schedule", "repeat", "House", "Apartment", "Moving", "place", "Rent", "Move", "Month", "Bedroom", "Lease", "Signed", "Roommate", "Interested", "Complex", "Area", "Interest", "apt", "Drinking", "Beer", "Drink", "Cold", "Root", "Beers", "Pong", "Ale", "Ginger", "Cans", "Drinkin", "ginger", "Pint", "Cans", "Bbq", "Pub", "bottles", "Home", "Work", "Ready", "Hubby", "Bed", "Dinner", "relax", "Shower", "Heading", "Relaxing", "Chill", "Nap", "Early", "Supper", "Snuggle", "Money", "Pay", "Bills", "Paid", "Paying", "Bill", "Job", "Month", "Rent", "Check", "Taxes", "Bucks", "Debt", "paycheck", "job", "Position", "Company", "Interview", "Experience", "Manager", "Assistant", "Interested", "Career", "Business", "Resume", "Sales", "Hiring", "Hire"]
stoplist = set('for a of the and to in'.split())

stemmer = PorterStemmer()

texts = [[word for word in string.lower().split() if word not in stoplist]
			for string in corpus]

words = reduce(list.__add__, texts)

stems = []
for word in words:
	stem = stemmer.stem(word)
	stems.append(stem)

stemCounts = {}

numStems = len(stems)
for word in stems:
	if word not in stemCounts:
		stemCounts[word] = 1.0
	else:
		stemCounts[word] = stemCounts[word] + 1.0


for word in stemCounts:
	stemCounts[word] = stemCounts[word] / numStems
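
An illustrative follow-up (not part of the original script): list the ten most frequent stems with their relative frequencies, reusing the pprint imported above.

pprint(sorted(stemCounts.items(), key=lambda kv: kv[1], reverse=True)[:10])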