Example #1
class WindowPorterStemStringFeature(object):
    def __init__(self):
        self._stemmer = PorterStemmer()

    def get_id(self):
        return 'WINDOW-STEM-STRING'

    def _stem(self, token):
        return self._stemmer.stem(token, 0, len(token) - 1)

    def featurise(self, document, sentence, annotation):
        NORMALISE = True

        before_ann = sentence.text[:annotation.start].split()
        before_ann.reverse()
        after_ann = sentence.text[annotation.end:].split()

        to_yield = []
        for i, tok in izip(xrange(1, 4), before_ann):
            to_yield.append((u'-BEFORE-{}-{}'.format(i, self._stem(tok)), 1))
        for i, tok in izip(xrange(1, 4), after_ann):
            to_yield.append((u'-AFTER-{}-{}'.format(i, self._stem(tok)), 1))
        for f_tup in to_yield:
            if NORMALISE:
                yield (f_tup[0], f_tup[1] / float(len(to_yield)))
            else:
                yield f_tup
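Note: most of the examples on this page call the classic reference porter.py, whose stem() takes the token plus start and end indices, while a few (e.g. Example #2) use a stemmer whose stem() takes only the word, as NLTK's does. A minimal sketch of both conventions, assuming porter.py is importable and NLTK is optionally installed:

from porter import PorterStemmer   # classic reference porter.py

p = PorterStemmer()
word = "connections"
# the reference implementation stems the slice word[i:j+1]
print(p.stem(word, 0, len(word) - 1))   # expected Porter output: 'connect'

# NLTK's stemmer takes just the word (only if nltk is installed):
# from nltk.stem.porter import PorterStemmer
# print(PorterStemmer().stem("connections"))   # 'connect'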
Example #2
 def stem_words(self, terms):
     """
     Remove the suffixes in terms.
     """
     porter_stemmer = PorterStemmer()  # we use the Porter stemming algorithm
     terms = [porter_stemmer.stem(word) for word in terms]
     return terms 
Example #3
def main(argv):
    files = os.listdir(sys.argv[1])
    file = files[0]
    stemmed = []
    for file in files:
        text = ""
        infile = open(sys.argv[1] + file)
        a = infile.readline()
        while a:
            text += removeSGML(a)
            a = infile.readline()
        tok = tokenizeText(text)
        removed = removeStopwords(tok)
        from porter import PorterStemmer
        p = PorterStemmer()
        for element in removed:
            stemmed.append(p.stem(element, 0, len(element) - 1))
    print "Words " + str(len(stemmed))
    unique = list(set(stemmed))
    print "Vocabulary " + str(len(unique))
    wordfrequency = [(unique[x], stemmed.count(unique[x]))
                     for x in range(0, len(unique))]
    sort = sorted(wordfrequency, key=getKey, reverse=True)
    for i in range(0, 49):
        print sort[i]
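A side note on the frequency count in the example above: stemmed.count() inside a list comprehension rescans the whole list for every unique word, while collections.Counter produces the same top-frequency list in one pass. A minimal equivalent sketch, reusing the stemmed list from the code above:

from collections import Counter

# same information as the wordfrequency/sort lines above, computed in one pass
for word, freq in Counter(stemmed).most_common(49):
    print(word, freq)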
Example #5
    def search(self, word):
        # Create an instance of the Porter Stemmer.
        PS = PorterStemmer()

        # Get the information for the supplied word.
        res = self.manage_DB.get_info("index_word", where={"word": PS.stem(word, 0, len(word) - 1)})

        # The supplied word exists in the index_word table.
        if res:
            # Extract the id for the supplied word.
            wordid = res["id"]

            # Get all the entries in the index reference database that refer to
            # the supplied wordid.
            res = self.manage_DB.get_info("index_ref", where={"wordid": wordid})

            # For every entry in the list.
            for row in res:
                # Modify the current row to contain the stem word.
                row["word"] = self.manage_DB.get_info("index_word", rowid=row[1])["word"]
                # Modify the current row to contain the document name.
                row["doc"] = self.manage_DB.get_info("document", rowid=row[2])["name"]

            # Return the list of all the results.
            return res
        # The supplied word does not exist in the index_word table, so return
        # an empty list.
        else:
            return []
Example #7
def stemList(list1):
    p = PorterStemmer()
    lAllWordsStemmed = []
    for word in list1:
        word = p.stem(word,0,len(word)-1)
        lAllWordsStemmed.append(word)
    return lAllWordsStemmed
Example #8
def stemWords(input):
    porter = PorterStemmer()
    words = input

    for index, word in enumerate(words):
        words[index] = porter.stem(word, 0, len(word) - 1)
    return words
Example #9
def get_stemmed_words(word_list):
    stemmer = PorterStemmer()
    stemmed_words = set()

    for word in word_list:
        # add the stemmed word itself; union() with a string would add its characters
        stemmed_words.add(stemmer.stem(word, 0, len(word) - 1))

    return stemmed_words
Example #10
def stemWords(tokens):
    ''' 
    input: list of tokens
    output: list of stemmed tokens
    use the porter.py
    '''
    p = PorterStemmer()
    return map(lambda t: p.stem(t, 0, len(t) - 1), tokens)
Example #11
def read_vocabulary(path):
    f = open(path)
    vocab = f.read()
    words = vocab.strip().split(", ")
    vocab = []
    stemmer = PorterStemmer()
    for word in words:
        vocab.append(stemmer.stem(word, 0, len(word)-1))
    return vocab
Example #12
def process(input):
    s2 = tokenizeText(input)
    s3 = removeStopwords(s2)
    pr = []
    from porter import PorterStemmer
    p = PorterStemmer()
    for element in s3:
        pr.append(p.stem(element, 0, len(element) - 1))
    return pr
Example #13
def process(input):
    s3 = tokenizeText(input)
    """s3 = removeStopwords(s2)"""
    pr = []
    from porter import PorterStemmer
    p = PorterStemmer()
    for element in s3:
        pr.append(p.stem(element, 0, len(element)-1))
    return pr
Example #14
    def create_posting_list(self, stopword_toggle, stemming_toggle):
        """
        function to go through all the documents abstracts cleaning
        and adding each term to a posting_list object and the
        term dictionary. removes all the special characters for each
        term. toggles stopwords and stemming accordingly

        Note: all terms are converted to lowercase

        :param stopword_toggle: boolean, toggles the stopword usage
        :param stemming_toggle: boolean, toggles the stemming of words
        """
        self.terms = {}
        self.termsDictionary = {}
        documents = self.documents
        stopwords = []
        if stopword_toggle:
            stopwords = fetch_stopwords()
        for doc_id, document in documents.items():
            if 'abstract' in document:
                for index, word in enumerate(document['abstract'].split(' ')):
                    word = word.rstrip().lower()

                    for a in [',', '.', '{', '}', '(', ')', ';', ':', '"', '\'']:
                        if a in word:
                            if word.index(a) == 0 or word.index(a) == len(word) - 1:
                                word = word.replace(a, '')
                    if stemming_toggle:
                        p = PorterStemmer()
                        word = p.stem(word, 0, len(word) - 1)

                    if word in stopwords:
                        continue

                    if len(word) > 0:
                        if word not in self.terms.keys():
                            self.terms[word] = {}

                        if doc_id not in self.terms[word].keys():
                            self.terms[word][doc_id] = {
                                'frequency': 0,
                                'position': [],
                            }

                        self.terms[word][doc_id]['frequency'] += 1
                        self.terms[word][doc_id]['position'].append(index)

        for term, value in self.terms.items():
            self.termsDictionary[term] = len(value)

        f = open('dictionary.json', 'w')
        f.write(json.dumps(self.termsDictionary, indent=4, sort_keys=True))
        f.close()

        f = open('posting-list.json', 'w')
        f.write(json.dumps(self.terms, indent=4, sort_keys=True))
        f.close()
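For reference, the posting list written above maps each term to a per-document record of the form {'frequency': n, 'position': [...]}. A small hedged sketch of reading it back from the file the method writes (the query term is made up):

import json

with open('posting-list.json') as f:
    postings = json.load(f)

term = 'stem'  # hypothetical query term
for doc_id, info in postings.get(term, {}).items():
    print(doc_id, info['frequency'], info['position'])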
Example #15
    def index_document(self, docid, path_physical):
        self.manage_DB.delete_references(docid)

        # Get the information for the supplied document.
        document = self.manage_DB.get_info('document', rowid=docid)

        # Open the document for reading.
        fhandle = open('%s%s' % (path_physical, docid), 'r')
        # Create an instance of the Porter Stemmer.
        PS = PorterStemmer()

        # Get the 1st line of the supplied document and force the contents to
        # lowercase.
        content = fhandle.readline().lower()

        # The text widget starts indexing its lines at 1, but columns start
        # indexing at 0.
        line_count = 1

        # While the supplied document has content to be read.
        while content != '':
            # Find all words from the current line of the supplied document
            # and put them in a list.
            words = re.findall('\w+', content)

            # For each word in the list of words from the current line.
            for word in words:
                # Only words whose length is greater than 3 will be indexed.
                if len(word) > 3:
                    # Check for the word in the list of stop words.
                    res = self.manage_DB.get_info('stop_words', where={
                        'word': word})

                    # If the word does not exist in the list of stop words:
                    if not res:
                        # The column of the current word is its index in the
                        # current line.
                        col_count = content.find(word) + 1
                        # Using the PorterStemmer, find the root of the current
                        # word. Add the root word, with the current line and
                        # column number to the index.
                        self.add_index_word(
                            PS.stem(word, 0, len(word) - 1),
                            docid,
                            line_count,
                            col_count,
                            word)
            # Get the next line of the supplied document and force the
            # contents to lowercase.
            content = fhandle.readline().lower()
            # Increment the line count.
            line_count += 1

        # Close the supplied document file.
        fhandle.close()
        return
Example #16
def stem(tokens):
    """
    receive tokens
    return stemmedTokens
    """
    stemmedTokens = []
    stemmer = PorterStemmer()
    for token in tokens:
        stemmedTokens.append(stemmer.stem(token, 0, len(token) - 1))

    return stemmedTokens
Example #17
def tokenize(inputStr):
	tokenPattern = re.compile(r'[^a-zA-Z0-9.,_]')	
#	tokenPattern = re.compile(r'[\s:?;()\[\]&!*@#$%+<>/\\\'\"]|\.(\.)+|(-)+')
	primordialTokens = re.split(tokenPattern, inputStr)
#	primordialTokens = inputStr.replace(">", " ").replace("...", " ").replace("-"," ").replace("'"," ").replace("/"," ").split(' ')
	stripPuncTokens = [x.strip(',.').replace(",","").lower() for x in primordialTokens if x != None]
	stripPuncTokens = [x for x in stripPuncTokens if x != '' and x not in stop_words]

	#stemming
	p = PorterStemmer()
	stemmedTokens = [p.stem(x, 0, len(x)-1) for x in stripPuncTokens]
	return stemmedTokens
Example #18
class StringPorterStemFeature(object):
    def __init__(self):
        self._stemmer = PorterStemmer()

    def _stem(self, token):
        return self._stemmer.stem(token, 0, len(token) - 1)

    def get_id(self):
        return 'STRING-STEM'

    def featurise(self, document, sentence, annotation):
        yield (self._stem(sentence.annotation_text(annotation)), 1)
Example #19
class SentencePorterStemStringFeature(object):
    def __init__(self):
        self._stemmer = PorterStemmer()

    def _stem(self, token):
        return self._stemmer.stem(token, 0, len(token) - 1)

    def get_id(self):
        return 'STRING-SENTENCE-STEM'

    def featurise(self, document, sentence, annotation):
        for token in sentence.text.split():
            yield (self._stem(token), 1)
Example #22
def porter(text):
    p = PorterStemmer()
    output = ''
    word = ''
    # walk the text character by character, stemming each alphabetic run
    for c in text:
        if c.isalpha():
            word += c.lower()
        else:
            if word:
                output += p.stem(word, 0, len(word) - 1)
                word = ''
            output += c.lower()
    if word:
        output += p.stem(word, 0, len(word) - 1)
    return output.split()
Example #23
  def add(self, text, fname, stem=False):
    """Add a string of text to the corpus by first splitting it into features
    defined by WORD_PAT, and then removing stop words.

    Takes a string as its argument."""
    for match in re.finditer(self.WORD_PATT, text):
      if match:
        word = match.group(0).lower()
        if word in self.STOPWORDS:
          self.removed.append(word)
          self.words.add_word(word, fname)
          continue
        if stem:
          p = PorterStemmer()
          word = p.stem(word, 0, len(word)-1)
Example #24
def revise_documents(docs, vocab):
    stemmer = PorterStemmer()
    senses = {}     # {reference:sense}
    for ref, text in docs.items():
        words = re.findall(r"[\w']+", text)
        word_list = []
        for w in words:
            if w == "tag":
                continue
            if w.isdigit() and int(w) > 100000:
                senses[ref] = w
                continue
            if w in vocab:
                word_list.append(stemmer.stem(w.lower(), 0, len(w) - 1))
        docs[ref] = word_list
    return docs, senses
Example #25
    def score_query(self, query, word_matrix, normalized_matrix,
                    stop_words_list, title_vocabulary_dict):
        porter_stemmer = PorterStemmer()
        square_sum = 0
        words = {}

        for word in query:
            word_without_punctuation = word.strip(string.punctuation).replace(
                " ", "").lower()
            if word_without_punctuation not in stop_words_list:
                stemmed_word = porter_stemmer.stem(
                    word_without_punctuation, 0,
                    len(word_without_punctuation) - 1)
                if stemmed_word not in words:
                    words[stemmed_word] = {}
                    words[stemmed_word]['repetitions'] = 0
                words[stemmed_word]['repetitions'] += 1

        for word, elements in words.items():
            square_sum += math.pow(elements['repetitions'], 2)
        for word, elements in words.items():
            if word in word_matrix:
                words[word]['normalized'] = words[word][
                    'repetitions'] / math.sqrt(square_sum)
                words[word]['weight'] = words[word][
                    'normalized'] * word_matrix[word]['idf']
            else:
                words[word]['normalized'] = 0
                words[word]['weight'] = 0
        aggregate_scores = {}
        title_addition_performed = []
        for word, elements in words.items():
            if word in normalized_matrix:
                for doc_id, doc_weight in normalized_matrix[word].items():
                    if doc_id not in aggregate_scores:
                        aggregate_scores[doc_id] = 0
                    aggregate_scores[doc_id] += doc_weight * elements['weight']
                    if word in title_vocabulary_dict:
                        if doc_id in title_vocabulary_dict[
                                word] and doc_id not in title_addition_performed:
                            aggregate_scores[doc_id] += 0.5
                            title_addition_performed.append(doc_id)
        return aggregate_scores
Example #26
    def search(self, word):
        # Create an instance of the Porter Stemmer.
        PS = PorterStemmer()

        # Get the information for the supplied word.
        res = self.manage_DB.get_index_word_info(
            PS.stem(word, 0, len(word) - 1))

        # The supplied word exists in the index_word table.
        if res:
            # Extract the id for the supplied word.
            wordid = res['id']

            # Return the found entries as a list.
            res = []

            # Query the index_ref table for all the entries whose wordid
            # match the supplied word's id.
            self.c.execute("""select * from index_ref where wordid=?""",
                (wordid,))

            # Retrieve all the results of the query as a list.
            entries = self.c.fetchall()

            # For every entry in the list.
            for row in entries:
                # Create a dictionary with the results and add the dictionary
                # to the list.
                res.append({
                    'id': row[0],
                    'word': self.manage_DB.get_index_word_info(row[1])['word'],
                    'docid': row[2],
                    'doc': self.manage_DB.get_document_info(row[2])['name'],
                    'line': row[3], 'column': row[4],
                    'branch_word': row[5]})

            # Return the list of all the results.
            return res
        # The supplied word does not exist in the index_word table, so return
        # an empty list.
        else:
            return []
Example #27
def tokenize_on_porter(text):
    word_list = []
    p = PorterStemmer()
    outfile = open('out3', 'w')
    for line in text.splitlines():
        output = ''
        word = ''
        if line != '':
            for c in line:
                if c.isalpha():
                    word += c.lower()
                else:
                    if word:
                        word_stem = p.stem(word, 0, len(word)-1)
                        output += word_stem
                        word_list.append(word_stem)
                        word = ''
                    output += c.lower()
        print(output, end='\n', file=outfile)
    outfile.close()
    return word_list
Example #28
    def make_cloud(self):
        stemdict, tempdict, finaldict = {}, {}, {}
        stopwords = open('stopwords.txt', 'r').read().split('\n')

        # Extract just the words inside quotes
        quotes = ' '.join(self.extract_quotes())
        wordlist = re.split('\s+', quotes.lower())
        
        p = PorterStemmer()
        punctuation = re.compile(r'[.?!,":;-]')

        # Stem all of the words in the word list using the Porter Stemmer
        for w in wordlist:
            w = punctuation.sub('', w)
            s = p.stem(w, 0,len(w)-1)
            try:
                tempdict[w] += 1
            except:
                tempdict[w] = 1
            stemdict.setdefault(s,{}).update({w:tempdict[w]})
        
        cumfreq = 0

        # Calculate the cumulative frequencies of the stemmed words
        for k, v in stemdict.items():
            for l, m in v.items():
                cumfreq = cumfreq + m
            items = v.items()
            items.sort(lambda x, y: cmp(y[1], x[1]))
            finaldict[items[0][0]] = cumfreq
            cumfreq = 0

        # Remove stopwords like "the", "it", "a", etc.
        for word in stopwords:
            try:
                del finaldict[word]
            except: pass

        results = self.process_cloud(8, finaldict.items()[:50])
        return results
Example #29
    def search(self, word, docid=None):
        # Create an instance of the Porter Stemmer.
        PS = PorterStemmer()

        # Get the information for the supplied word.
        res = self.manage_DB.get_info('index_word', where={
            'word': PS.stem(word, 0, len(word) - 1)})

        # The supplied word exists in the index_word table.
        if res:
            # Extract the id for the supplied word.
            wordid = res[0]['id']

            if docid:
                # Get all the entries in the index reference database that refer to
                # the supplied wordid.
                res = self.manage_DB.get_info('index_ref', where={
                    'wordid': wordid, 'docid': docid})
            else:
                # Get all the entries in the index reference database that refer to
                # the supplied wordid.
                res = self.manage_DB.get_info('index_ref', where={
                    'wordid': wordid})

            # For every entry in the list.
            for row in res:
                # Modify the current row to contain the stem word.
                row['word'] =  self.manage_DB.get_info(
                    'index_word', rowid=row['wordid'])['word']
                # Modify the current row to contain the document name.
                row['doc'] = self.manage_DB.get_info(
                    'document', rowid=row['docid'])['name']

            # Return the list of all the results.
            return res
        # The supplied word does not exist in the index_word table, so return
        # an empty list.
        else:
            return []
Example #30
def getDocStuff(dDocProps):
    lAllLists = []

    if (constants.T in dDocProps):
        lAllLists.append(dDocProps[constants.T])
        putinDPLace("1",dDocProps[constants.T])
    if (constants.W in dDocProps):
        lAllLists.append(dDocProps[constants.W])
        putinDPLace("2",dDocProps[constants.W])
    if (constants.A in dDocProps):
        lAllLists.append(dDocProps[constants.A])
        putinDPLace("3",dDocProps[constants.A])

    lAllLines = []
    for lList in lAllLists:
        lAllLines.extend(lList)
    
    lAllWords = []
    for sLine in lAllLines:
        sLine = re.sub('[^a-zA-Z0-9]', ' ', sLine)
        lWords = sLine.lower().split()
        lAllWords.extend(lWords)
    lw = copy.deepcopy(lAllWords)
    lAllWords = helperFunctions.remStopWords(lAllWords)

    p = PorterStemmer()
    lAllWordsStemmed = []
    for word in lAllWords:
        word = p.stem(word,0,len(word)-1)
        lAllWordsStemmed.append(word)

    lUniqueWords = list(set(lAllWordsStemmed))
    lenAllWords = len(lAllWordsStemmed)
    constants.allDocsLen = constants.allDocsLen+lenAllWords
    sRet = helperFunctions.makeFixedLengthStr(len(lAllWordsStemmed),constants.docWordCntLen)+constants.space+helperFunctions.makeFixedLengthStr(len(lUniqueWords),constants.docWordCntLen)+constants.newLine

    return [sRet,lAllWordsStemmed," ".join(lw)]
Example #31
    def process_query(self, query):
        all_doc_count = len(self.invert.documents.keys())
        query_array = [x.lower() for x in query.split(' ')]
        query_weights = {}
        stopwords = []
        if self.stopword_toggle:
            stopwords = fetch_stopwords()
        while query_array:
            word = query_array.pop(0)
            frequency = 1

            for a in [',', '.', '{', '}', '(', ')', ';', ':', '"', '\'']:
                if a in word:
                    if word.index(a) == 0 or word.index(a) == len(word) - 1:
                        word = word.replace(a, '')

            while word in query_array:
                query_array.pop(query_array.index(word))
                frequency += 1

            if self.stemming_toggle:
                p = PorterStemmer()
                word = p.stem(word, 0, len(word) - 1)

            if word in stopwords:
                continue

            term_weight = 0
            if word in self.invert.termsDictionary.keys():
                document_frequency = self.invert.termsDictionary[word]
                idf = math.log(all_doc_count / document_frequency)
                term_frequency = 1 + math.log(frequency)
                term_weight = idf * term_frequency

            query_weights[word] = term_weight
        return query_weights
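The query weighting above is plain tf-idf: idf = log(N / df) and tf = 1 + log(f), multiplied together. A toy check with made-up numbers (N = 100 documents, df = 10, query-term frequency 2):

import math

N, df, f = 100, 10, 2          # assumed toy values
idf = math.log(N / df)         # ~ 2.3026
tf = 1 + math.log(f)           # ~ 1.6931
print(tf * idf)                # ~ 3.8995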
Example #32
def getDocStuff(dDocProps):
    global T,W,B,A,N,I
    lAllLists = []
    if (T in dDocProps):
        lAllLists.append(dDocProps[T])
    if (W in dDocProps):
        lAllLists.append(dDocProps[W])
    #if (B in dDocProps):
    #    lAllLists.append(dDocProps[B])
    if (A in dDocProps):
        lAllLists.append(dDocProps[A])
    #if (N in dDocProps):
    #    lAllLists.append(dDocProps[N])

    lAllLines = []
    for lList in lAllLists:
        lAllLines.extend(lList)
    
    lAllWords = []
    for sLine in lAllLines:
        lWords = sLine.split()
        lAllWords.extend(lWords)

    lAllWords = helperFunctions.remStopWords(lAllWords)

    p = PorterStemmer()
    lAllWordsStemmed = []
    for word in lAllWords:
        word = p.stem(word,0,len(word)-1)
        lAllWordsStemmed.append(word)
    #print("All words :", lAllWordsStemmed,"\n")
    lUniqueWords = list(set(lAllWordsStemmed))
    lenAllWords = len(lAllWordsStemmed)
    sRet = makeFixedLengthStr(len(lAllWordsStemmed),6)+" "+makeFixedLengthStr(len(lUniqueWords),6) #+":"+dDocProps[B][0]
    return [sRet,lAllWordsStemmed]
Example #34
print("dataset imported")

import re

import nltk

nltk.download('stopwords')

# to remove stopword
from nltk.corpus import stopwords

# for stemming purposes
#from nltk.stem.porter import PorterStemmer
from porter import PorterStemmer
p = PorterStemmer()
p.stem("Alcoholic")

# Initialize empty array
# to append clean text
corpus = []
for i in range(0, 1000):
    # column : "Review", row ith
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    # convert all cases to lower cases
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()

    # loop for stemming each word
    # in string array at ith row
    review = [
Example #35
dataFile = open("temp1.txt", "w")
comDataFile = open("com.txt", "w")
testFile = open("output.txt", "w")
stopWord=open("stopwords.txt").read()
stopWord=stopWord.split("\n")
stemmer= PorterStemmer()
countPo=0
countSo=0
trainingSet=f.readlines()
testingSet=v.readlines()
trainingSet=trainingSet
testingSet=testingSet

# initialize the stemmer object for (optional) stemming later
stemmer = PorterStemmer()
# stem each stop word individually (stem() expects a single token, not a list)
stopWord = [stemmer.stem(w, 0, len(w) - 1) for w in stopWord]

def getCleanString(string):
        """
                fix the string for best results
		the cleaning involve 
		(
			remove all the special character except _ and - ,
			convert upper case to lower case letter
			stemmering "remove [word with]ing|s|ed...etc"
		)
        """
        string = re.sub(r'([^a-zA-Z\-\_])|(http.+)', '', string)
        string = string.lower()
        string = stemmer.stem(string, 0, len(string) - 1)
        return string
Example #36
def stem(word):
    p = PorterStemmer()
    return p.stem(word, 0, len(word) - 1)
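A short hedged usage sketch for the wrapper above (assumes the three-argument porter.py stem used throughout these examples; the comments give the standard Porter outputs):

print(stem("caresses"))   # expected: 'caress'
print(stem("ponies"))     # expected: 'poni'
print(stem("running"))    # expected: 'run'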
Example #37
def test():
    stem = input("Was the stemmer used in the inversion? (Y/N)")
    return_times = []
    g = open("postings.txt", "r")
    content = g.read().replace('\n', ' ')
    post_list = json.loads("[" + content[:-2] + "]")
    h = open("cacm.all", "r")
    lines = h.readlines()

    if g.mode == 'r' and h.mode == 'r':
        word = ""
        while word != "zzend":
            word = input("Enter a term to search for: ").lower()
            if stem == "Y":
                p = PorterStemmer()
                word = p.stem(word, 0, len(word) - 1)

            found_word = False
            start = timer()
            for elem in post_list:
                if word == elem[0]:
                    found_word = True
                    print("\nThis term is found in " + str(len(elem[1])) +
                          " documents.")
                    print(
                        "============================================================================="
                    )
                    break
            if found_word:
                print(
                    "This search term is found in the following documents:\n")
                # output all docs that contain that term: DocID, title, TF, all the positions, first occurrence with 10
                # words
                docdata = []
                for entry in post_list:
                    if entry[0] == word:
                        docdata += entry[1]
                        break
                # docdata now has doc ID, TF, and positions for each document input_txt appears in
                # now search in cacm for word data
                count = 0
                get_title = False
                abstract_bool = False
                abstract_text = ""
                title = ""
                output = ""
                found = False
                for line in lines:
                    if count == len(docdata):
                        break
                    if line.startswith(".I " + str(docdata[count][0])):
                        found = True
                    if line == ".B\n" and found:
                        get_title = False
                        abstract_bool = False
                        found = False
                        # I need to create the output string here, as its all going to be reset now.
                        output += "Document " + str(docdata[count][0]) + " - " + title + "Term frequency: " + \
                                  str(docdata[count][1]) + "\nList of positions: " + str(docdata[count][2]) + \
                                  "\nFirst occurrence in document: " + \
                                  getcontext(title + abstract_text, docdata[count][2][0]) + "\n" + "------------" + "\n"
                        title = ""
                        abstract_text = ""
                        count += 1
                    if abstract_bool:
                        abstract_text += line
                    if line == ".W\n" and found:
                        get_title = False
                        abstract_bool = True
                    if get_title:
                        title += line
                    if line == ".T\n" and found:
                        get_title = True

                end = timer()
                elapsed_time = (end - start)
                if found_word:
                    return_times += [elapsed_time]
                print(output)
                print("Search time: " + str(elapsed_time) + " seconds\n")

                # output time to results
            elif word != "zzend":
                print("Term not found in any documents")
        shutdown(return_times)
        g.close()
        h.close()
    else:
        print("Error opening file. Try again.")
Example #38
def processEmail(email_contents):
    #PROCESSEMAIL preprocesses a the body of an email and
    #returns a list of word_indices
    #   word_indices = PROCESSEMAIL(email_contents) preprocesses
    #   the body of an email and returns a list of indices of the
    #   words contained in the email.
    #

    # Load Vocabulary
    vocab = getVocabDict()

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers

    # hdrstart = email_contents.find('\n\n')
    # email_contents = email_contents[hdrstart+2:]

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with >
    # and does not have any < or > in the tag and replace it with a space
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.sub('[0-9]+', 'number', email_contents)

    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr',
                            email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # Handle $ sign
    email_contents = re.sub('[$]+', 'dollar', email_contents)

    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print '\n==== Processed Email ====\n'

    # Process file
    l = 0
    porterStemmer = PorterStemmer()
    # Tokenize and also get rid of any punctuation
    sep = '[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\},\'\"\>\_\<\;\%\n\r]+'
    for s in re.split(sep, email_contents):
        # Remove any non alphanumeric characters
        s = re.sub('[^a-zA-Z0-9]', '', s)

        # Stem the word
        s = porterStemmer.stem(s.strip())

        # Skip the word if it is too short
        if len(s) < 1:
            continue

        # Look up the word in the dictionary and add to word_indices if
        # found
        # ====================== YOUR CODE HERE ======================
        # Instructions: Fill in this function to add the index of str to
        #               word_indices if it is in the vocabulary. At this point
        #               of the code, you have a stemmed word from the email in
        #               the variable s. You should look up s in the
        #               vocabulary dictionary (vocab). If a match exists, you
        #               should add the index of the word to the word_indices
        #               vector. Concretely, if s = 'action', then you should
        #               add to word_indices the value under the key 'action'
        #               in vocab. For example, if vocab['action'] = 18, then,
        #               you should add 18 to the word_indices vector
        #               (e.g., word_indices.append(18) ).
        #
        if s in vocab:
            word_indices.append(vocab[s])

        # =============================================================

        # Print to screen, ensuring that the output lines are not too long
        if l + len(s) + 1 > 78:
            print
            l = 0
        print s,
        l += len(s) + 1

    # Print footer
    print '\n========================='

    return array(word_indices)
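A toy check of the preprocessing substitutions above (the sample e-mail text is made up; raw strings are used for the regexes):

import re

email = "Visit http://example.com or mail me@example.com for $100 off!"
email = email.lower()
email = re.sub(r'<[^<>]+>', ' ', email)
email = re.sub(r'[0-9]+', 'number', email)
email = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email)
email = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email)
email = re.sub(r'[$]+', 'dollar', email)
print(email)   # visit httpaddr or mail emailaddr for dollarnumber off!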
Example #40
class Tokenizer():
    def __init__(self, PATH_TO_STOP_WORDS):
        print("[Tokenizer] Instantiated!")
        self.PATH_TO_STOP_WORDS = PATH_TO_STOP_WORDS
        self.STOP_WORDS = self.load_stopwords()
        self.PorterStemmer = PorterStemmer()

    """
  Tokenizes on these rules:
    SPLIT INTO TOKENS
    STRIP TOKENS' WHITESPACES, NEWLINES AND PUNCTUATIONS DANGLING IN BETWEEN
    STEM EVERY TOKEN
    REMOVE TOKEN IF IS STOP WORD

  Returns list of text normalized tokens
  """

    def tokenize(self, input_str):
        result = []
        # input_str_list = input_str.split()
        input_str_list = re.split('\W+', input_str)

        for token in input_str_list:
            result_tok = token.strip(PUNCTUATIONS)
            if len(result_tok) > 1 and \
               not self.is_stopword(result_tok.lower()) and \
               not self.isMixedNumeric(result_tok):

                result_tok = self.stem(result_tok)
                result.append(result_tok.lower())

        return result

    def stem(self, word):
        return self.PorterStemmer.stem(word, 0, len(word) - 1)

    def remove_stopwords(self, tokens):
        return list(filter(lambda tok: tok not in self.STOP_WORDS, tokens))

    def is_stopword(self, token):
        return self.STOP_WORDS.get(token)

    #===========================================================================#
    # STRING MANIPULATION FUNCS
    #===========================================================================#
    """
  Split on 1st whitespace from back
  """

    def split_on_whitespace_from_back(self, input_str):
        return input_str.rsplit(' ', 1)

    """
  Split on 1st '/' from back
  """

    def split_on_slash_from_back(self, input_str):
        return input_str.rsplit('/', 1)

    """
  Trim newline char from an input string
  """

    def strip_newline(self, input_str):
        return input_str.strip('\n')

    """
  Determines whether an input string has the RegEx given in this function
  A RegEx match object will be returned if a complete match occurs
  """

    def isMixedNumeric(self, input_str):
        pattern = r'([0-9]+[!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]*)+'
        return re.match(pattern, input_str)

    #===========================================================================#
    # SETUP
    #===========================================================================#
    def load_stopwords(self):
        f = open(self.PATH_TO_STOP_WORDS, 'r')
        stopwords = f.read().splitlines()

        stopword_dict = {}
        for word in stopwords:
            stopword_dict[word] = True

        return stopword_dict
Example #42
def lookup(user_input, CLI, K):
    use_stem = False
    stop_words = False
    g = open("postings.txt", "r")
    f = open("cacm.all", "r")
    content = g.read().replace('\n', ' ')
    if content[0] == "1":
        use_stem = True
    if content[1] == "1":
        stop_words = True
    post_list = json.loads("[" + content[2:-2] + "]")
    lines = f.readlines()
    f.close()
    extracted_postings = []
    docs = []
    final_list = []

    if g.mode == 'r':
        # get query
        og_query = user_input.lower()
        og_query = re.sub('[\-]+', ' ', og_query)
        og_query = re.sub('[^A-Za-z0-9$ ]+', '', og_query)
        newquery = og_query.split()

        if stop_words:
            temp = []
            stop_words = open("stopwords.txt", "r").read().split('\n')
            for i in range(len(stop_words)):
                stop_words[i] = stop_words[i].lower()
            for word in newquery:
                if word not in stop_words:
                    temp.append(word)
            newquery = temp
        if use_stem:
            stemmed_query = ""
            for word in newquery:
                p = PorterStemmer()
                word = p.stem(word, 0, len(word) - 1)
                stemmed_query += word + " "
            newquery = stemmed_query.split()

        newquery.sort()
        term_list = get_term_lists(newquery, post_list)
        # remove duplicates if they exist
        term_list = list(dict.fromkeys(term_list))

        for entry in term_list:
            extracted_postings.append(post_list[entry])
        # get docs out of extracted postings
        for posting in extracted_postings:
            for entry in posting[1]:
                docs.append(entry[0])
        docs = list(dict.fromkeys(docs))
        docs.sort()
        document_vectors = get_doc_vector(docs, lines, use_stem, stop_words)
        # print("Relevant document vectors created. Now calculating cosine similarity")
        # now, make all of those vectors have tf values, and then weights
        cosine_list = fill_vectors(document_vectors, og_query, docs)
        temp_list = []
        for i in range(len(docs)):
            temp_list.append([docs[i], cosine_list[i]])
        temp_list.sort(key=lambda x: x[1])
        temp_list.reverse()
        if CLI:
            print("Query was: " + user_input + "\n")
            display(temp_list, get_doc_info(docs, lines))
        for elem in temp_list:
            final_list.append(elem[0])
        if K is None:
            return final_list
        else:
            return final_list[:K]
Example #43
class InfoRetrieval:
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.data = []
        self.db = JokerDatabase()
        self.db.connect()
        self.total_docs = 0

    def restore_persisted_state(self):
        state = self.db.restore_state()
        self.total_docs = len(state.docs)
        self.data.append(state)
        self.calculate_idfs(state)

    def stem_words(self, data):
        for key, value in data.docs.iteritems():
            stemmed_words = []
            for word in value.words:
                stemmed_words.append(self.stemmer.stem(word, 0, len(word)-1))
            value.words = stemmed_words

    def calculate_query_idf(self, query):
        idf_dict = {}
        total = self.total_docs
        
        for word in query.split(' '):
            doc_ct = 0
            for data2 in self.data:
                for key2, value2 in data2.docs.iteritems():
                    for word2 in value2.words:
                        if word2 == word:
                            doc_ct += 1
                            break
            if doc_ct == 0:
                idf_dict[word] = 0
            else:
                idf_dict[word] = float(math.log((int(total)/int(doc_ct)), 2))
        
        return idf_dict

    def calculate_query_tf(self, query):
        freq_dict = {}
        tf_dict = {}
        total_words = len(query.split(' '))

        for word in query.split(' '):
            freq_dict[word] = 0
 
        for word in query.split(' '):
            freq_dict[word] += 1
        
        for key, value in freq_dict.iteritems():
            freq_dict[key] = value / total_words

        return freq_dict 

    def calculate_idfs(self, data):
        total = self.total_docs
        
        for key1, value1 in data.docs.iteritems():
            for word1 in value1.words:
                doc_ct = 0
                for data2 in self.data:
                    for key2, value2 in data2.docs.iteritems():
                        for word2 in value2.words:
                            if word2 == word1:
                                doc_ct += 1
                                break

                value1.terms_idf[word1] = math.log10(int(total)/int(doc_ct))

    def do_clear(self):
        for dfile in self.data:
            for key in dfile.docs.iterkeys():
                self.db.remove_id(key)

    def do_print(self, docid):
        found = False
        for dfile in self.data:
            if dfile.docs.has_key(str(docid)):
                found = True
                print dfile.docs[str(docid)].document.text

        if found == False:
            print "Document not found."

    def do_read(self, filename):
        data = JokerData(filename) 
        data.parse_docs()

        self.stem_words(data)
        self.data.append(data)
        
        count = 0 
        for dfile in self.data:
           for doc in dfile.docs:
               count += 1
        self.total_docs = count
 
        self.calculate_idfs(data)
        self.db.persist_docs(data)
       

    def do_list(self):
        index = 0
        for dfile in self.data:
            print index, ":", dfile.filename
            for key in dfile.docs.iterkeys():
                print "    ", key 

    def do_show(self, docid):
        found = False
        for dfile in self.data:
            if dfile.docs.has_key(str(docid)):
                found = True
                print "\nWords:"
                print dfile.docs[str(docid)].words
                print "\nTerm Freqs:"
                print dfile.docs[str(docid)].terms_freq
                print "\nIDFs:"
                print dfile.docs[str(docid)].terms_idf

        if found == False:
            print "Document not found."

    def do_sim(self, docID1, docID2):
        doc1 = 0
        doc2 = 0
        for dfile in self.data:
            for key, value in dfile.docs.iteritems():
                if key == docID1:
                    doc1 = value
                elif key == docID2: 
                    doc2 = value

        if doc1 == 0 or doc2 == 0:
            print "Error invalid docID"
            return    
            
        doc1_wgts = doc1.tf_idf()
        doc2_wgts = doc2.tf_idf()
        
        sim = 0
        for key1, value1 in doc1_wgts.iteritems():
            for key2, value2 in doc2_wgts.iteritems():
                if key2 == key1:
                    sim += value1
                    sim += value2        
        print "Sim: ", sim
        return sim

    def do_search(self, query):
        tfs  = self.calculate_query_tf(query)
        idfs = self.calculate_query_idf(query)
        tf_idfs = {}

        for key, value in tfs.iteritems():
            tf_idfs[key] = value * idfs[key]

        sims = {} 
        for dfile in self.data:
            for key, value in dfile.docs.iteritems():
                sims[key] = self.query_similarity(tf_idfs, value)

        sorted_sims = sorted(sims.iteritems(), key=operator.itemgetter(1), reverse=True)
        for pair in sorted_sims:
            if pair[1] > 0:
                print "    ", pair[0], ":", pair[1] 

    def do_search_doc(self, docid):
        sims = {}
        for dfile in self.data:
            for key, value in dfile.docs.iteritems():
                sims[key] = self.do_sim(docid, key)    
     
        sorted_sims = sorted(sims.iteritems(), key=operator.itemgetter(1), reverse=True)
        print "Most relevant documents:"
        for pair in sorted_sims:
            if pair[1] > 0:
                print "    ", pair[0], ":", pair[1] 


    def query_similarity(self, query_wgt, doc):
        doc_wgts = doc.tf_idf()
        sim = 0

        for key, value in query_wgt.iteritems():
            if doc_wgts.has_key(key):
                sim += doc_wgts[key]
        return sim

    def do_read_list(self, lst):
        # read each listed filename (stripping the trailing newline) and load it
        myf = open(lst, 'r')
        for line in myf.readlines():
            self.do_read(line.strip())
        myf.close()

    def do_quit(self):
        return None

    def show_consol(self):
        values = {
                    'clear'     : self.do_clear,
                    'print'     : self.do_print,
                    'read'      : self.do_read,
                    'list'      : self.do_list,
                    'read_list' : self.do_read_list,
                    'show'      : self.do_show,
                    'sim'       : self.do_sim,
                    'search'    : self.do_search,
                    'search_doc': self.do_search_doc,
                    'quit'      : self.do_quit
                 }

        while True:
            self.show_menu()
            try:
                choice = sys.stdin.readline()
            except KeyboardInterrupt:
                break
            
            current_opt = choice.replace('\n', '').split(' ')
            if not values.has_key(current_opt[0].lower()):
                continue
            
            func = values[current_opt[0].lower()]

            if current_opt[0] == 'quit':
               return 
            elif current_opt[0] == 'search' and "\"" in choice: 
                cs = choice.split('"')
                func(cs[1])
            elif len(current_opt) == 3:
                func(current_opt[1], current_opt[2])
            elif len(current_opt) == 2:
                func(current_opt[1])
            elif len(current_opt) == 1: 
                func()

    def show_menu(self):
        print "Document Collection Options:\n   -CLEAR\n   -PRINT <docID>\n   -SHOW <docID>\n   -READ <filename>\n   -READ_LIST <list>\n   -SIM    <docID> <docID>\n   -SEARCH_DOC <docID>\n   -SEARCH <query>\n   -QUIT"
class SentenceSplitter(object):
    """
    Разбиваем отдельные предложения на токены, токены стеммируем. 
    При этом сохраняется структура текста, т.е. абзацы.
    На вход принимаем список предложений, на выходе возвращаем
    список стем в виде [[[],[],[]],[[],[]]], где второй уровень 
    вложенности - это абзацы, третий - сами преложения.
    """
    def __init__(self, stopwords, VERBTRANSFORMS, NOUNTRANSFORMS, lexicon_de, language):

        self.language = language
        
        self.stopwords = stopwords

        self.VERBTRANSFORMS = VERBTRANSFORMS
        self.NOUNTRANSFORMS = NOUNTRANSFORMS

        # characters that will be stripped from the beginning and end of a token
        self.punctuation = "∙!‼¡\"#£€$¥%&'()*+±×÷·,-./:;<=>?¿@[\]^ˆ¨_`—–­{|}~≈≠→↓¬’“”«»≫‘…¦›🌼′″¹§¼⅜½¾⅘©✩✒•►●★❤➡➜➚➘➔✔➓➒➑➐➏➎➍➌➋➊❸❷■†✝✌️³‎²‚„ ​"
        # for splitting into tokens on whitespace and slashes
        self.splitchars = re.compile(r'[\s\\\/\(\)\[\]\<\>\;\:\,\‚\—\?\!\|\"«»…#]|\.\.\.+|[ �⌂ ∞½¾►=]|\-\-|\.[\'\"’“”«»‘′″„-]')

        if self.language == 'ru':
            self.stemmer = RussianStemmer()
            # a pymorphy2.MorphAnalyzer() object; we will use its normal_form attribute
            self.lemmatizer_ru = pymorphy2.MorphAnalyzer()
            self.normalizer = NormalizerRU()
        elif self.language == 'de':
            self.stemmer = GermanStemmer()
            self.normalizer = NormalizerDE()
            self.lexicon_de = lexicon_de
        else:
            self.stemmer = PorterStemmer()
            self.normalizer = NormalizerEN()


    def tokenizeString(self, sentence):

        """
        Функция последовательной обработки каждого слова. Получает на вход предложение, создает список tokens,
        складывает туда выделенные re.split'ом слова, 'отрезая' пунктуацию с концов слова и понижая регистр, 
        и удаляет по ходу окончания-сокращения функцией del_contractions.
        Дальше заменяет неправильные формы глаголов и сущ-х правильными (и расставляет теги 
        определённых маркеров).
        """

        # token generator: split the string into tokens with the splitchars regexp,
        # then strip punctuation around each token and lowercase it
        if self.language == 'ru':
            tokens = (self.normalizer.normalizeLetters(token.strip(self.punctuation).lower()) for token in self.splitchars.split(sentence))

        elif self.language == 'de':
            tokens = (self.normalizer.normalizeLetters(self.normalizer.deleteContrs(token.strip(self.punctuation).lower())) for token in self.splitchars.split(sentence))

        else:
            tokens = (self.normalizer.token_transform(self.normalizer.del_contractions(token.strip(self.punctuation).lower()), self.VERBTRANSFORMS, self.NOUNTRANSFORMS) for token in self.splitchars.split(sentence))

        return tokens


    def tokenizeWithCase(self, sentence):

        """
        Такая же функция токенизации, только без приведения слов к нижнему регистру

        """

        # token generator: split the string into tokens with the splitchars regexp,
        # then strip punctuation around each token
        if self.language == 'ru':
            tokens = (self.normalizer.normalizeLetters(token.strip(self.punctuation)) for token in self.splitchars.split(sentence))
            tokens_with_case = [token for token in tokens if token.lower() not in self.stopwords]

        elif self.language == 'de':
            tokens = (self.normalizer.normalizeLetters(self.normalizer.deleteContrs(token.strip(self.punctuation))) for token in self.splitchars.split(sentence))
            tokens_with_case = [token for token in tokens if token.lower() not in self.stopwords]
        else:
            tokens = (self.normalizer.token_transform(self.normalizer.del_contractions(token.strip(self.punctuation)), self.VERBTRANSFORMS, self.NOUNTRANSFORMS) for token in self.splitchars.split(sentence))
            tokens_with_case = [token for token in tokens if token.lower() not in self.stopwords]

        
        return tokens_with_case


    def stemTokens(self, sentence):
        """
        Функция формирует список стеммированных терминов с удалением стоп-слов.
        Возвращает список кортежей, в которых содержатся стеммы значимых слов и
        сами слова. (Это необходимо для последующего извлечения ключевых слов)
        """

        # term generator: stem every term that is not in the stop-word list
        if self.language == 'ru':
            stemmed_sentence = ((self.stemmer.stem(self.lemmatizer_ru.parse(term)[0].normal_form), term) for term in self.tokenizeString(sentence) if term not in self.stopwords)
        
        elif self.language == 'de':
            stemmed_sentence = ((self.stemmer.stem(self.normalizer.lemmatize(term, self.lexicon_de)), term) for term in self.tokenizeString(sentence) if term not in self.stopwords)

        else:
            stemmed_sentence = ((self.stemmer.stem(term, 0, len(term)-1), term) for term in self.tokenizeString(sentence) if term not in self.stopwords)
        
        
        # stemmed_sentence is a generator and therefore always truthy,
        # so it can be returned directly
        return stemmed_sentence


    def tokenizeListParagraphs(self, list_of_sentences):
        """
        Получает список предложений, сгруппированных по абзацам.
        Каждое слово из списка стеммированных токенов складывает
        в новый список с сохранением структуры абзацев.
        """

        tokenized_sentences = []

        for sentences in list_of_sentences:

            terms_list = []

            for s in sentences:

                terms_in_sentence = []

                for term_pair in self.stemTokens(s):

                    if len(term_pair[0]) > 0:

                        terms_in_sentence.append(term_pair)

                terms_list.append(terms_in_sentence)

            tokenized_sentences.append(terms_list)

        return tokenized_sentences


    def tokenizeListSentences(self, list_of_sentences):
        """
        Получает список предложений. (без абзацев)
        
        """

        tokenized_sentences = []

        for s in list_of_sentences:

            terms_in_sentence = []

            for term_pair in self.stemTokens(s):

                if len(term_pair[0]) > 0:

                    terms_in_sentence.append(term_pair)

            tokenized_sentences.append(terms_in_sentence)

        return tokenized_sentences

    def tokenizeSentencesWithCaseKeeping(self, list_of_sentences):
        """
        Получает список предложений с сохранением регистра. (без абзацев)
        
        """

        tokenized_sentences = []

        for s in list_of_sentences:

            terms_in_sentence = []

            for term in self.tokenizeWithCase(s):

                if len(term) > 0:

                    terms_in_sentence.append(term)

            tokenized_sentences.append(terms_in_sentence)

        return tokenized_sentences
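A minimal usage sketch for SentenceSplitter, assuming English mode and that the stop-word set and transform dictionaries may be passed as small stand-ins (the constructor only stores them); the sample paragraphs and the printed structure are illustrative, not taken from the original code.

# Hypothetical usage sketch (English mode); PorterStemmer and NormalizerEN are
# assumed to be importable exactly as in the class above.
splitter = SentenceSplitter(stopwords=set(['the', 'a', 'is', 'are']),
                            VERBTRANSFORMS={}, NOUNTRANSFORMS={},
                            lexicon_de=None, language='en')

paragraphs = [["Cats are chasing mice.", "Dogs were barking loudly."],
              ["The weather is getting colder."]]

# The nested structure is preserved: paragraphs -> sentences -> (stem, word) pairs
for paragraph in splitter.tokenizeListParagraphs(paragraphs):
    for sentence in paragraph:
        print sentence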
class BuildTermSpace(object):

	"""
	Создание json-объекта в виде словаря, в котором хранятся стеммы
	значимых слов и их частотность из указанных корпусов.
	"""

	def __init__(self, language='en', action='tfidf'):

		# Call LoadExternalLists to build the stop-word list and,
		# depending on the language, load the German lexicon
		self.language = language
		self.action = action

		# characters stripped from the beginning and the end of a token
		self.punctuation = "∙!‼¡\"#£€$¥%&'()*+±×÷·,-./:;<=>?¿@[\]^ˆ¨_`—–­{|}~≈≠→↓¬’“”«»≫‘…¦›🌼′″¹§¼⅜½¾⅘©✒•►●★❤➡➜➚➘➔✔➓➒➑➐➏➎➍➌➋➊❸❷■†✝✌️³‎²‚„ ​"

		loadRes = LoadExternalLists()

		if self.language == 'de':
			self.stopwords = loadRes.loadStopWordsDE()
			# stemmer object
			self.stemmer = GermanStemmer()
			# German lexicon
			print '\n', "Loading German Dictionary... OK", '\n'
			self.lexicon_de = loadRes.loadLexiconDe()
			self.normalizer = NormalizerDE()
		elif self.language == 'ru':
			self.stopwords = loadRes.loadStopWordsRU()
			self.stemmer = RussianStemmer()
			# pymorphy2.MorphAnalyzer() object; we use its normal_form attribute
			self.lemmatizer_ru = pymorphy2.MorphAnalyzer()
			self.normalizer = NormalizerRU()
		else:
			self.stopwords = loadRes.loadStopWordsEN()
			self.stemmer = PorterStemmer()
			self.normalizer = NormalizerEN()
			# irregular verb forms
			self.irreg_verbs = loadRes.loadVerbForms()
			# irregular noun forms
			self.irreg_nouns = loadRes.loadNounforms()
	

	def processString(self, line):
		"""
		Функция последовательной обработки каждого слова. Получает на вход строку, создает список tokens,
		складывает туда выделенные re.split'ом слова, 'отрезая' пунктуацию с концов слова и понижая регистр, 
		и удаляет по ходу окончания-сокращения функцией del_contractions
		Дальше переходит к формированию списка стеммированных терминов rslt_list с удалением стоп-слов
		и цифровых последовательностей.
		Возвращает список rslt_list, в котором содержатся только стеммы значимых слов.
		"""

		# splits text into tokens on whitespace, slashes and other separators
		splitchars = re.compile(r'[\s\\\/\(\)\[\]\<\>\;\:\,\‚\—\?\!\|\"«»…#]|\.\.\.+|[ �⌂ ∞½¾►=]|\-\-|\.[\'\"’“”«»‘′″„-]') # [\.\:][\'\"’“”«»‘′″]

		# used to skip tokens that contain digits
		esc_num = re.compile(r'[0-9]+')

		# used to skip URLs (currently disabled)
		#url_esc = re.compile(r'([a-z]{3,6}:\/\/)?([a-zA-Z0-9\-@?]+[\.|\:])+[a-z]{2,13}[\.\?\=\&\%\,\#\+\(\)\/\w\-]*')

				
		if self.language == 'de':
			tokens = (self.normalizer.normalizeUmlaut(self.normalizer.deleteContrs(token.strip(self.punctuation).lower())) for token in splitchars.split(line))
			rslt_list = (self.stemmer.stem(self.normalizer.lemmatize(term, self.lexicon_de)) for term in tokens if term not in self.stopwords and not esc_num.search(term) and len(term)>0)	# the esc_num filter drops tokens that contain digits

		elif self.language == 'ru':
			tokens = (self.normalizer.normalizeE(token.strip(self.punctuation).lower()) for token in splitchars.split(line))
			rslt_list = (self.stemmer.stem(self.lemmatizer_ru.parse(term)[0].normal_form) for term in tokens if term not in self.stopwords and not esc_num.search(term) and len(term)>0)	# the esc_num filter drops tokens that contain digits

		else:
			# token generator: split the line into tokens with the splitchars regexp,
			# 2. strip surrounding punctuation from each token and lowercase it,
			# 3. transform irregular verb forms into their base forms,
			# 4. remove contracted endings with \'
			tokens = (self.normalizer.token_transform(self.normalizer.del_contractions(token.strip(self.punctuation).lower()), self.irreg_verbs, self.irreg_nouns) for token in splitchars.split(line))

			# term generator: stem every term that is not in the stop-word list and contains no digits
			rslt_list = (self.stemmer.stem(term, 0, len(term)-1) for term in tokens if term not in self.stopwords and not esc_num.search(term) and len(term)>0)	# the esc_num filter drops tokens that contain digits
		

		# rslt_list is a generator and therefore always truthy, so return it directly
		return rslt_list


	def processFile(self, filename):
		"""
		Читает файл в utf-16, для каждой строки файла вызывает функцию processString,
		каждое слово из получившегося списка добавляет в set terms_set, избавляясь от
		дубликатов.
		Возвращает список уникальных лемм.
		"""

		terms_set = set()
		terms_list = []

		if self.action == 'tfidf':
			try:
				with codecs.open(filename, 'r', 'utf-16') as infile:
					
					for line in infile:
						if len(line) > 1:
							for term in self.processString(line):
								terms_set.add(term)
								
			except (UnicodeDecodeError, UnicodeError, IOError):
				pass

			return terms_set

		if self.action == 'raw':
			try:
				with codecs.open(filename, 'r', 'utf-16') as infile:
					
					for line in infile:
						if len(line) > 1:
							for term in self.processString(line):
								terms_list.append(term)
								
			except (UnicodeDecodeError, UnicodeError, IOError):
				pass

			return terms_list


	def crawl(self, dirname):
		"""
		Функция проходит по папкам и подпапкам указанной в качестве аргумента директории.
		Проверяет, если файл текстовый, то запускает функцию processFile и складывает результат
		её работы в set terms_set.
		В общем terms_dict подсчитывается частотность каждой леммы, словарь сохраняется как json.
		terms_dict отражает по сути вторую часть формулы tfidf, т.е. показывает в каком количестве
		документов встретился термин.
		"""

		docs_num = 0

		terms_dict = defaultdict(int)
		
		for root, dirs, files in os.walk(dirname):

			print root, "processing..."
			
			for filename in files:

				if filename.endswith('.txt') or filename.endswith('.TXT'):
					
					print filename

					terms_set = self.processFile(join(root,filename))

					for term in terms_set:

						terms_dict[term] += 1

					docs_num+=1
					
		if self.action == 'raw':
			with codecs.open(r'.\termSpace\\'+self.language.upper()+'frequency_list_stem.txt', 'w', 'utf-16') as outfile:
				for key, value in sorted(terms_dict.iteritems(), key=lambda x:x[1], reverse=True):
					outfile.write(key+'\t'+str(value))
					outfile.write('\n')
		
		if self.action == 'tfidf':

			with open(r".\termSpace\\" + self.language.upper() + "CorpusDict_" + str(docs_num) + ".json", 'w') as  outfile:
				json.dump(terms_dict, outfile)
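The crawl() above effectively stores the document-frequency half of tf-idf. Below is a rough sketch of how the dumped dictionary could be turned into IDF scores later; the file name, corpus size and the smoothed IDF variant are assumptions for illustration, not taken from this code.

# Hypothetical follow-up: compute IDF scores from the dumped document-frequency dict.
import json
import math

docs_num = 1000                                          # assumed corpus size
with open(r".\termSpace\ENCorpusDict_1000.json") as f:   # assumed file name
    doc_freqs = json.load(f)

def idf(stem):
    # smoothed inverse document frequency; unseen stems get the maximum score
    return math.log(float(docs_num) / (1 + doc_freqs.get(stem, 0)))

print idf('hous')   # e.g. the Porter stem of "house" / "houses"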
Ejemplo n.º 46
0
# MIT Licensed
# Copyright 2014 REM <*****@*****.**>.

from pymongo import MongoClient
from pymongo import DESCENDING
import utility
from porter import PorterStemmer

p = PorterStemmer()

client = MongoClient('localhost', 27017)
db = client.uberly
clt = db.uber_vocab_1

#db.uber_dictionary.find().limit(50).sort({value:-1}).pretty()
for entry in clt.find().sort([('value', DESCENDING)]):
  entry['stem'] = p.stem(entry['_id'], 0,len(entry['_id'])-1)
  clt.save(entry)
  print entry['_id'], entry['stem']
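A possible follow-up, sketched under the assumption that pymongo 3.x is used (so aggregate() returns an iterable cursor) and that every document now carries the 'stem' field written above: collapse the vocabulary by stem and sum the raw counts.

# Hypothetical follow-up: group the vocabulary by stem and sum the 'value' counts.
pipeline = [
    {'$group': {'_id': '$stem', 'total': {'$sum': '$value'}}},
    {'$sort': {'total': -1}},
    {'$limit': 10},
]
for row in clt.aggregate(pipeline):
    print row['_id'], row['total']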
  
Ejemplo n.º 47
0
class Preprocessor:
    '''
    Constructor.
    Initializes object with pre-compiled regexes.
    '''
    def __init__(self):

        self.spec_chars_regex = re.compile('[^0-9a-zA-Z]')
        self.camel_case_regex_1 = re.compile('(.)([A-Z][a-z]+)')
        self.camel_case_regex_2 = re.compile('([a-z0-9])([A-Z])')
        self.stemmer = PorterStemmer()  # from Gupta's Porter Stemmer

    '''
    removeSpecialChars(self, string)
    Returns a copy of string with all non-alphanumeric characters replaced by
    whitespace.
    '''

    def removeSpecialChars(self, string):
        return self.spec_chars_regex.sub(' ', string)

    '''
    splitCamelCase(self, string)
    Returns a copy of string with each camelCase word split into separate words
    with whitespace in between.
    '''

    def splitCamelCase(self, string):
        newString = self.camel_case_regex_1.sub(r'\1 \2', string)
        return self.camel_case_regex_2.sub(r'\1 \2', newString)

    '''
    porterStem(self, words)
    Returns a list of each word in words that have been stemmed by the Porter
    Stemmer.
    '''

    def porterStem(self, words):
        return [self.stemmer.stem(word, 0, len(word) - 1) for word in words]

    '''
    preprocess(self, string)
    Replaces all special (non-alphanumeric) characters in string with
        whitespace, splits all camelCase words in string into separate words
        with whitespace in between, splits all remaining words into a list of
        all words (in lowercase) found in the edited string, and returns a list
        of all the words' stems via the Porter Stemmer.
    '''

    def preprocess(self, string):
        newString = self.removeSpecialChars(string)
        newString = self.splitCamelCase(newString)
        words = newString.lower().split()
        return self.porterStem(words)

    '''
    prepDoc(self, doc)
    Opens the document specified as doc, preprocesses each line and returns a
    list of arrays containing the preprocessed lines of the document.
    '''

    def prepDoc(self, doc, combine=False):
        texts = []
        with open(doc) as file:
            for line in file:
                prepLine = self.preprocess(line)
                if len(prepLine) > 0:
                    texts.append(prepLine)
        if combine == False:
            return texts
        else:
            return self.combineVectors(texts)

    '''
    combineVectors(self, texts)
    Turns a list of vectors into one long vector, and returns this long vector.
    '''

    def combineVectors(self, texts):
        newTexts = []
        for text in texts:
            newTexts += text
        return newTexts
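A small usage sketch for Preprocessor; the input string and the commented expected output are illustrative guesses, and the exact stems depend on the Porter implementation used.

# Hypothetical usage sketch: special characters are replaced, camelCase is split,
# and each resulting lowercase word is Porter-stemmed.
pre = Preprocessor()
print pre.preprocess("getUserName(id) throws IllegalArgumentException")
# roughly: ['get', 'user', 'name', 'id', 'throw', 'illeg', 'argument', 'except']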
Ejemplo n.º 48
0
                elif (query < lData[0]):
                    print (query+ " < "+lData[0])
                    return search(query,index+1,middle,m1)
                elif (query > lData[0]):
                    print (query+ " > "+lData[0]+" so go to  index : "+str(index+middle))
                    return search(query,index+middle,middle,m1)
                
                
                else:
                    return []


        print(" size : "+str(int(m1.size()/35)))
        p = PorterStemmer()
        query= sys.argv[1]
        query = p.stem(query,0,len(query)-1)
           
        lBytes = search(query,0,int(m1.size()/35),m1)
        print("lBytes : ",lBytes)

        if(lBytes != None):
            termFile = "./indexes/terms.txt"
            with open(termFile, "r+b") as f:
                # memory-map the file, size 0 means whole file
                map = mmap.mmap(f.fileno(), 0)
                print("Term stuff :  ",map[int(lBytes[0]):int(lBytes[1])])

        else:
            print("Not found")
'''            termsListFile = "./indexes/termsList.txt"
            fl = open(termsListFile,"r")
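The fragment above appears to binary-search a memory-mapped index of fixed-width records (35 bytes each, hence m1.size()/35). Below is a self-contained sketch of that idea; the record width, index file name and "term then byte offsets" layout are assumptions for illustration, not the original index format.

# Hypothetical sketch: binary search over fixed-width records in a sorted index file.
import mmap

RECORD_WIDTH = 35   # assumed record size in bytes

def find_record(mapped, query):
    lo, hi = 0, mapped.size() // RECORD_WIDTH
    while lo < hi:
        mid = (lo + hi) // 2
        record = mapped[mid * RECORD_WIDTH:(mid + 1) * RECORD_WIDTH]
        fields = record.split()               # assumed layout: term, then byte offsets
        term = fields[0] if fields else ''
        if term == query:
            return record
        elif query < term:
            hi = mid
        else:
            lo = mid + 1
    return None

with open("./indexes/sorted_terms.txt", "r+b") as f:   # assumed index file
    m1 = mmap.mmap(f.fileno(), 0)
    print find_record(m1, "comput")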
Ejemplo n.º 49
0
# A quick example of stemming

from porter import PorterStemmer
from lancaster import LancasterStemmer

porter = PorterStemmer()
lancaster = LancasterStemmer()

words = [
    "dogs", "cats", "cafes", "shops", "bookshops", "bars", "cafe", "columbia",
    "coffee", "coffees", "outdoors"
]

for w in words:
    print("Word: {0}".format(w))
    print("Stem: {0}".format(porter.stem(w)))
    print("======")