Example #1
def _createFeatureVect(self, input):
    # Build a binary bag-of-words vector over the precomputed common-word
    # vocabulary: 1 if the stemmed token occurs in the input text.
    x = numpy.zeros(len(self._common_words))
    for word in input.x.split():
        word = PorterStemmer().stem(word.lower())
        if word in self._common_words:
            x[self._common_words.index(word)] = 1
    return x
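A minimal usage sketch of the snippet above, with a hypothetical stand-alone
function and a small hand-picked vocabulary in place of the class state
(self._common_words) that the original method relies on:

import numpy
from nltk.stem import PorterStemmer

def create_feature_vect(text, common_words):
    # Binary bag-of-words vector: 1 where the stemmed token is in the vocabulary.
    x = numpy.zeros(len(common_words))
    for word in text.split():
        word = PorterStemmer().stem(word.lower())
        if word in common_words:
            x[common_words.index(word)] = 1
    return x

common_words = ["free", "offer", "win"]  # already-stemmed vocabulary
print(create_feature_vect("Win a FREE prize", common_words))  # [1. 0. 1.]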
Example #2
    def generate_hash_map(self):
        """
        Build a hash map of all words in the currently parsed file.
        Each word is first stemmed with the Porter algorithm and then
        inserted into the hash map.
        :return:
        """

        # clear the hash map
        self._hash_map.clear()

        for line in self._document_content:

            # PUNCTUATION_TRANS is assumed to be a str.maketrans() table
            # that strips punctuation from the line.
            line = line.translate(PUNCTUATION_TRANS)
            words = line.split()

            for word in words:

                word = PorterStemmer().stem(word)
                word = word.lower()

                # count only alphabetic, non-stop words
                if word.isalpha():
                    if not self._is_stop_word(word):

                        # if the word is not in the hash map yet
                        if word not in self._hash_map:
                            self._hash_map[word] = 1
                        else:
                            self._hash_map[word] += 1
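PUNCTUATION_TRANS itself is not part of the snippet; a minimal sketch of how
such a translation table could be built in Python 3 (an assumption, not the
original module's definition):

import string

# Map every ASCII punctuation character to None, i.e. delete it on translate().
PUNCTUATION_TRANS = str.maketrans('', '', string.punctuation)

print("hash-map, parsed!".translate(PUNCTUATION_TRANS))  # hashmap parsed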
Example #3
def _findFeatureVector(self, trainingData):
    # Fill row i of the design matrix with the binary bag-of-words vector
    # of the i-th training example.
    for i in range(trainingData.m):
        x = numpy.zeros(trainingData.n)
        for word in trainingData.TrainingData[i].x.split():
            word = PorterStemmer().stem(word.lower())
            if word in self._common_words:
                x[self._common_words.index(word)] = 1
        self._X[i, :] = x
Example #4
def findMostCommonWords(trainingData):
    # Count Porter-stemmed, lower-cased tokens over the whole training set.
    words = dict()
    for input in trainingData.TrainingData:
        for word in input.x.split():
            word = PorterStemmer().stem(word.lower())
            if word in words:
                words[word] += 1
            else:
                words[word] = 1
    # Sort (word, count) pairs by count and keep the n most frequent words.
    sorted_words = sorted(words.items(),
                          key=operator.itemgetter(1),
                          reverse=True)
    return [word[0] for word in sorted_words][:trainingData.n]
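The same counting logic, sketched more compactly with collections.Counter
(assuming the same trainingData interface as above; the function name is
hypothetical):

from collections import Counter
from nltk.stem import PorterStemmer

def find_most_common_words(trainingData):
    stemmer = PorterStemmer()
    # Count stemmed, lower-cased tokens across all training inputs.
    counts = Counter(
        stemmer.stem(word.lower())
        for input in trainingData.TrainingData
        for word in input.x.split()
    )
    # most_common(n) already returns the n highest-count (word, count) pairs.
    return [word for word, _ in counts.most_common(trainingData.n)]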
Example #5
    def pre_processing_line(self, line):
        line = cjson.decode(line)
        list_name = line['list_name']
        # split CamelCase list names, e.g. TexasAggies -> Texas Aggies
        list_name = list_name.strip()
        name_split = re.split(r'\W+|_|\d+', list_name)
        new_list = []
        for text in name_split:
            if text == '':
                continue
            # walk the token and insert a space at every lower-to-upper
            # case transition
            words = ''
            prev_letter = text[0]
            words += prev_letter
            for i in range(1, len(text)):
                cur_letter = text[i]
                if cur_letter.isupper() and prev_letter.islower():
                    words += ' '
                words += cur_letter
                prev_letter = cur_letter
            # stem and lower-case every resulting word
            for word in words.split(' '):
                word = PorterStemmer().stem_word(word)
                new_list.append(word.lower())
        # if no tag exists, skip this line
        if new_list == []:
            return 0
        line['tag'] = [word for word in new_list if word not in self.stoplist_]
        del line['list_name']
        del line['_id']
        return line
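A hedged sketch of the same CamelCase split done with a single regular
expression instead of the character-by-character loop (the helper name is
hypothetical):

import re

def split_camel_case(token):
    # Insert a space at every lower-to-upper case boundary,
    # so 'TexasAggies' becomes 'Texas Aggies'.
    return re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', token)

print(split_camel_case('TexasAggies'))  # Texas Aggies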
Example #6
def query_result(getter_address, search_query, max_distance):

    database_name = "givegetgreen_db"
    conn = sqlite3.connect(database_name)

    address_list = []
    all_fields_list = []
    hits = []
    ix = open_dir("indexdir")
    f = open("search_results.txt", "a")

    # Stem every query word and join the stems into a single OR query.
    search_list = search_query.split(" ")
    stemmed_words = [PorterStemmer().stem(words.lower()) for words in search_list]
    search_query = " OR ".join(stemmed_words)

    with ix.searcher() as searcher:
        query = MultifieldParser(["title", "category", "description"],
                                 schema=ix.schema).parse(search_query)
        results = searcher.search(query)
        for words in results:
            f.write(str(words) + "\n")
            hits.append(str(words))
            address_list.append(words['address'])

            # Look up the matching posting row by its indexed id.
            c = conn.cursor()
            x = (int(words['id']), )
            for row in c.execute('SELECT id FROM posting_posting WHERE id = ?',
                                 x):
                all_fields_list.append(row)
    f.close()
    getter_address = getter_address.lower()
    conn.commit()
    conn.close()
    return add_filter(getter_address, address_list, all_fields_list,
                      max_distance)
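Instead of concatenating " OR " into the query string by hand, Whoosh's parser
can be switched to OR semantics with OrGroup; a minimal sketch (the helper name
is hypothetical):

from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser, OrGroup
from nltk.stem import PorterStemmer

def build_or_query(ix, search_query):
    # Stem the raw query terms, then let the parser OR them together.
    stemmer = PorterStemmer()
    stemmed = " ".join(stemmer.stem(w.lower()) for w in search_query.split())
    parser = MultifieldParser(["title", "category", "description"],
                              schema=ix.schema, group=OrGroup)
    return parser.parse(stemmed)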
Example #7
def index(document_directory, dictionary_file):
    # preprocess docID list
    docID_list = []
    i = 1
    for doc in os.listdir(document_directory):
        docID_list.append([doc, i])
        i += 1
    f1 = open("output", "wb")
    pickle.dump(docID_list, f1)
    f1.close()
    stopwords = nltk.corpus.stopwords.words('english')
    docs_indexed = 0  # counter for the number of docs indexed
    dictionary = {}  # key: term, value: docIDs containing term (includes repeats)
    l_no = {}
    length = []
    count = 0
    cnt = 0
    c1 = 0
    word_positions = 0
    # for each document in corpus
    for docID in docID_list:
        if (LIMIT and docs_indexed == LIMIT): break
        file_path = os.path.join(document_directory, str(docID[0]))

        # if valid document
        cnt += 1
        line_no = 1
        if (os.path.isfile(file_path)):
            file = codecs.open(file_path, encoding='utf-8', errors='ignore')
            line = file.readline()
            c = 0
            # for line in document
            while line != '':  # read entire document
                tokens = nltk.word_tokenize(
                    line)  # list of word tokens from document
                # for each term in document
                for word in tokens:
                    word_positions += 1
                    word = PorterStemmer().stem(word)
                    c += 1
                    count += 1
                    term = word.lower()  # casefolding
                    if (IGNORE_STOPWORDS and term in stopwords):
                        continue  # if ignoring stopwords
                    if (IGNORE_NUMBERS and term.isnumeric()):
                        continue  # if ignoring numbers
                    if (term[-1] == "'"):
                        term = term[:-1]  # remove apostrophe
                    if (IGNORE_SINGLES and len(term) == 1):
                        continue  # if ignoring single terms
                    if (len(term) == 0): continue  # ignore empty terms

                    # if term not already in dictionary
                    if (term not in dictionary):
                        dictionary[term] = [int(
                            docID[1])]  # define new term in dictionary
                        po = [line_no, word_positions]
                        l_no[term] = [po]
                        c1 += 1
                    # else if term is already in dictionary, append docID
                    else:
                        dictionary[term].append(docID[1])
                        po = [line_no, word_positions]
                        l_no[term].append(po)
                line_no += 1
                line = file.readline()

            docs_indexed += 1
            length.append(c)
            file.close()

    f3 = open("len", "wb")
    pickle.dump(l_no, f3)
    f3.close()
    dict_file = codecs.open(dictionary_file, 'w', encoding='utf-8')
    dict_file.write(str(cnt) + '\n')
    ct = count / cnt
    fre = {}
    pr = 0
    score = {}
    for term, docs in dictionary.items():
        for x in range(cnt):
            freq = 0
            n = 0
            for t in dictionary[term]:
                if (x + 1 == t):
                    freq += 1
                if (pr != t and pr != 0):
                    n += 1
                pr = t
            # length[x] is the token count of document x + 1 (docIDs are 1-based).
            sc = score_BM25(n, freq, cnt, length[x], ct)
            if (term not in score):
                score[term] = [sc]  # define new term in dictionary
            else:
                score[term].append(sc)
            if (term not in fre):
                fre[term] = [freq]  # define new term in dictionary
            else:
                fre[term].append(freq)
        dict_file.write(term + " " + str(dictionary[term]) + " " +
                        str(l_no[term]) + "\n")
    # close files
    dict_file.close()
    f2 = open("dict", "wb")
    pickle.dump(score, f2)
    f2.close()
    f4 = open("freq", "wb")
    pickle.dump(fre, f4)
    f4.close()
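score_BM25 is not defined in this snippet; a hedged sketch of a standard Okapi
BM25 scorer with the same argument order (n = document frequency of the term,
freq = term frequency in the document, cnt = number of documents, dl = document
length, avdl = average document length), using assumed k1 and b constants:

from math import log

def score_BM25(n, freq, cnt, dl, avdl, k1=1.2, b=0.75):
    # idf weight times a length-normalised, saturated term-frequency factor
    idf = log((cnt - n + 0.5) / (n + 0.5) + 1)
    tf = (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * dl / avdl))
    return idf * tf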
Example #8
from nltk.stem import PorterStemmer

def stemming(word):
    # stem_word comes from older NLTK releases; NLTK 3+ exposes stem() instead.
    word = PorterStemmer().stem_word(word.lower())
    return word
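Expected behaviour of the helper above; on NLTK 3+ the equivalent call is
PorterStemmer().stem():

from nltk.stem import PorterStemmer

# Equivalent calls on NLTK 3+, where stem_word was replaced by stem().
print(PorterStemmer().stem("running"))   # run
print(PorterStemmer().stem("stemming"))  # stem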
Example #9
    def make_wordwheres(self):
        self.wordswhere = " TRUE "
        
        limits = []
        
        if self.word_limits:
            """
            This doesn't currently allow mixing one-word and two-word searches
            in a logical way. It might be possible to just join on both
            tables in MySQL--I'm not completely sure what would happen.
            But the philosophy has been to keep users from doing those
            searches as far as possible in any case.
            """

            for phrase in self.limits['word']:
                locallimits = dict()
                array = phrase.split()
                for n, word in enumerate(array):
                    searchingFor = word
                    if self.word_field == "stem":
                        from nltk import PorterStemmer
                        searchingFor = PorterStemmer().stem_word(searchingFor)
                    if self.word_field == "case_insensitive" or \
                       self.word_field == "Case_Insensitive":
                        # That's a little joke. Get it?
                        searchingFor = searchingFor.lower()

                    
                    selectString = "SELECT wordid FROM %s WHERE %s = %%s" % (self.wordsheap, self.word_field)
                    logging.debug(selectString)
                    cursor = self.db.cursor
                    cursor.execute(selectString,(searchingFor,))

                    # Set the search key being used.
                    search_key = "wordid"
                    if self.gram_size() > 1:
                        # 1-indexed entries in the bigram tables.
                        search_key = "word{}".format(n + 1)
                    
                    for row in cursor.fetchall():
                        wordid = row[0]
                        try:
                            locallimits[search_key] += [wordid]
                        except KeyError:
                            locallimits[search_key] = [wordid]

                if len(locallimits) > 0:
                    limits.append(where_from_hash(locallimits, comp = " = ", escapeStrings=False))
                    

            self.wordswhere = "(" + ' OR '.join(limits) + ")"
            if limits == []:
                # In the case that nothing has been found, tell it explicitly to search for
                # a condition when nothing will be found.
                self.wordswhere = "bookid = -1"

        wordlimits = dict()

        limitlist = copy.deepcopy(list(self.limits.keys()))

        for key in limitlist:
            if re.search(r"words\d", key):
                wordlimits[key] = self.limits[key]
                self.max_word_length = max(self.max_word_length, 2)
                del self.limits[key]

        if len(list(wordlimits.keys())) > 0:
            self.wordswhere = where_from_hash(wordlimits)

        return self.wordswhere
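where_from_hash is not shown in this snippet; a heavily hedged sketch of a
minimal stand-in that produces the kind of clause make_wordwheres joins
together (the real helper in the repository is more general):

def where_from_hash(limits, comp=" = ", escapeStrings=True):
    # Turn {"wordid": [1, 2]} into "(wordid = 1 OR wordid = 2)"; alternative
    # values for one field are OR-ed, different fields are AND-ed.
    clauses = []
    for field, values in limits.items():
        fmt = "'{}'" if escapeStrings else "{}"
        alternatives = [field + comp + fmt.format(v) for v in values]
        clauses.append("(" + " OR ".join(alternatives) + ")")
    return " AND ".join(clauses)

print(where_from_hash({"wordid": [12, 99]}, comp=" = ", escapeStrings=False))
# (wordid = 12 OR wordid = 99)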