Example #1
 def stem_words(self, terms):
     """
     Remove the suffixes in terms.
     """
     porter_stemmer = PorterStemmer()  # we use the Porter stemming algorithm
     terms = [porter_stemmer.stem(word) for word in terms]
     return terms
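
Most of the examples on this page use one of two stem() call styles: the one-argument stem(word) of stemmer classes such as NLTK's PorterStemmer (as in Example #1 above), or the three-argument stem(word, i, j) of the classic porter.py reference port, which takes the start and end character indices of the word (as in most of the examples below). A minimal sketch contrasting the two calls, assuming both modules are importable under the names used in these examples:

from porter import PorterStemmer as ClassicPorterStemmer         # stem(word, i, j)
from nltk.stem.porter import PorterStemmer as NltkPorterStemmer  # stem(word)

word = "caresses"

# classic reference port: pass the start and end character indices
classic = ClassicPorterStemmer()
print(classic.stem(word, 0, len(word) - 1))  # -> "caress"

# one-argument API (e.g. NLTK): the whole word is stemmed
modern = NltkPorterStemmer()
print(modern.stem(word))                     # -> "caress"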
Example #2
def main(argv):
    files = os.listdir(sys.argv[1])
    file = files[0]
    stemmed = []
    for file in files:
        text = ""
        infile = open(sys.argv[1] + file)
        a = infile.readline()
        while a:
            text += removeSGML(a)
            a = infile.readline()
        tok = tokenizeText(text)
        removed = removeStopwords(tok)
        from porter import PorterStemmer
        p = PorterStemmer()
        for element in removed:
            stemmed.append(p.stem(element, 0, len(element) - 1))
    print "Words " + str(len(stemmed))
    unique = list(set(stemmed))
    print "Vocabulary " + str(len(unique))
    wordfrequency = [(unique[x], stemmed.count(unique[x]))
                     for x in range(0, len(unique))]
    sort = sorted(wordfrequency, key=getKey, reverse=True)
    for i in range(0, 49):
        print sort[i]
Example #3
def stemWords(input):
    porter = PorterStemmer()
    words = input

    for index, word in enumerate(words):
        words[index] = porter.stem(word, 0, len(word) - 1)
    return words
Example #4
	def __init__(self, language='en', action='tfidf'):

		# Call LoadExternalLists, build the stop-word list,
		# and load the German lexicon.
		self.language = language
		self.action = action

		# characters to be stripped from the beginning and end of a token
		self.punctuation = "∙!‼¡\"#£€$¥%&'()*+±×÷·,-./:;<=>?¿@[\]^ˆ¨_`—–­{|}~≈≠→↓¬’“”«»≫‘…¦›🌼′″¹§¼⅜½¾⅘©✒•►●★❤➡➜➚➘➔✔➓➒➑➐➏➎➍➌➋➊❸❷■†✝✌️³‎²‚„ ​"

		loadRes = LoadExternalLists()

		if self.language == 'de':
			self.stopwords = loadRes.loadStopWordsDE()
			# stemmer object
			self.stemmer = GermanStemmer()
			# German dictionary
			print '\n', "Loading German Dictionary... OK", '\n'
			self.lexicon_de = loadRes.loadLexiconDe()
			self.normalizer = NormalizerDE()
		elif self.language == 'ru':
			self.stopwords = loadRes.loadStopWordsRU()
			self.stemmer = RussianStemmer()
			# pymorphy2.MorphAnalyzer() object; we will use its normal_form attribute
			self.lemmatizer_ru = pymorphy2.MorphAnalyzer()
			self.normalizer = NormalizerRU()
		else:
			self.stopwords = loadRes.loadStopWordsEN()
			self.stemmer = PorterStemmer()
			self.normalizer = NormalizerEN()
			# list of irregular verbs
			self.irreg_verbs = loadRes.loadVerbForms()
			# list of irregular nouns
			self.irreg_nouns = loadRes.loadNounforms()
Example #5
    def search(self, word):
        # Create an instance of the Porter Stemmer.
        PS = PorterStemmer()

        # Get the information for the supplied word.
        res = self.manage_DB.get_info("index_word", where={"word": PS.stem(word, 0, len(word) - 1)})

        # The supplied word exists in the index_word table.
        if res:
            # Extract the id for the supplied word.
            wordid = res["id"]

            # Get all the entries in the index reference database that refer to
            # the supplied wordid.
            res = self.manage_DB.get_info("index_ref", where={"wordid": wordid})

            # For every entry in the list.
            for row in res:
                # Modify the current row to contain the stem word.
                row["word"] = self.manage_DB.get_info("index_word", rowid=row[1])["word"]
                # Modify the current row to contain the document name.
                row["doc"] = self.manage_DB.get_info("document", rowid=row[2])["name"]

            # Return the list of all the results.
            return res
        # The supplied word does not exist in the index_word table, so return
        # an empty list.
        else:
            return []
Example #6
 def stem_words(self, terms):
     """
     Remove the suffixes in terms.
     """
     porter_stemmer = PorterStemmer()  # we use the Porter stemming algorithm
     terms = [porter_stemmer.stem(word) for word in terms]
     return terms 
Example #7
    def __init__(self, stopwords, VERBTRANSFORMS, NOUNTRANSFORMS, lexicon_de, language):

        self.language = language
        
        self.stopwords = stopwords

        self.VERBTRANSFORMS = VERBTRANSFORMS
        self.NOUNTRANSFORMS = NOUNTRANSFORMS

        # characters to be stripped from the beginning and end of a token
        self.punctuation = "∙!‼¡\"#£€$¥%&'()*+±×÷·,-./:;<=>?¿@[\]^ˆ¨_`—–­{|}~≈≠→↓¬’“”«»≫‘…¦›🌼′″¹§¼⅜½¾⅘©✩✒•►●★❤➡➜➚➘➔✔➓➒➑➐➏➎➍➌➋➊❸❷■†✝✌️³‎²‚„ ​"
        # for splitting into tokens on whitespace and slashes
        self.splitchars = re.compile(r'[\s\\\/\(\)\[\]\<\>\;\:\,\‚\—\?\!\|\"«»…#]|\.\.\.+|[ �⌂ ∞½¾►=]|\-\-|\.[\'\"’“”«»‘′″„-]')

        if self.language == 'ru':
            self.stemmer = RussianStemmer()
            # pymorphy2.MorphAnalyzer() object; we will use its normal_form attribute
            self.lemmatizer_ru = pymorphy2.MorphAnalyzer()
            self.normalizer = NormalizerRU()
        elif self.language == 'de':
            self.stemmer = GermanStemmer()
            self.normalizer = NormalizerDE()
            self.lexicon_de = lexicon_de
        else:
            self.stemmer = PorterStemmer()
            self.normalizer = NormalizerEN()
Example #8
def stemList(list1):
    p = PorterStemmer()
    lAllWordsStemmed = []
    for word in list1:
        word = p.stem(word,0,len(word)-1)
        lAllWordsStemmed.append(word)
    return lAllWordsStemmed
Example #9
def stemWords(tokens):
    ''' 
    input: list of tokens
    output: list of stemmed tokens
    use the porter.py
    '''
    p = PorterStemmer()
    return map(lambda t: p.stem(t, 0, len(t) - 1), tokens)
Example #10
def get_stemmed_words(word_list):
    stemmer = PorterStemmer()
    stemmed_words = set()
    
    for word in word_list:
        stemmed_words.add(stemmer.stem(word, 0, len(word) - 1))
        
    return stemmed_words
Example #11
def process(input):
    s2 = tokenizeText(input)
    s3 = removeStopwords(s2)
    pr = []
    from porter import PorterStemmer
    p = PorterStemmer()
    for element in s3:
        pr.append(p.stem(element, 0, len(element) - 1))
    return pr
Example #12
def read_vocabulary(path):
    f = open(path)
    vocab = f.read()
    words = vocab.strip().split(", ")
    vocab = []
    stemmer = PorterStemmer()
    for word in words:
        vocab.append(stemmer.stem(word, 0, len(word)-1))
    return vocab
Example #13
def process(input):
    s3 = tokenizeText(input)
    """s3 = removeStopwords(s2)"""
    pr = []
    from porter import PorterStemmer
    p = PorterStemmer()
    for element in s3:
        pr.append(p.stem(element, 0, len(element)-1))
    return pr
Example #14
    def create_posting_list(self, stopword_toggle, stemming_toggle):
        """
        Goes through all the document abstracts, cleaning each term and
        adding it to a posting_list object and the term dictionary.
        Removes all special characters from each term and toggles
        stopwords and stemming accordingly.

        Note: all terms are converted to lowercase

        :param stopword_toggle: boolean, toggles the stopword usage
        :param stemming_toggle: boolean, toggles the stemming of words
        """
        self.terms = {}
        self.termsDictionary = {}
        documents = self.documents
        stopwords = []
        if stopword_toggle:
            stopwords = fetch_stopwords()
        for doc_id, document in documents.items():
            if 'abstract' in document:
                for index, word in enumerate(document['abstract'].split(' ')):
                    word = word.rstrip().lower()

                    for a in [',', '.', '{', '}', '(', ')', ';', ':', '"', '\'']:
                        if a in word:
                            if word.index(a) == 0 or word.index(a) == len(word) - 1:
                                word = word.replace(a, '')
                    if stemming_toggle:
                        p = PorterStemmer()
                        word = p.stem(word, 0, len(word) - 1)

                    if word in stopwords:
                        continue

                    if len(word) > 0:
                        if word not in self.terms.keys():
                            self.terms[word] = {}

                        if doc_id not in self.terms[word].keys():
                            self.terms[word][doc_id] = {
                                'frequency': 0,
                                'position': [],
                            }

                        self.terms[word][doc_id]['frequency'] += 1
                        self.terms[word][doc_id]['position'].append(index)

        for term, value in self.terms.items():
            self.termsDictionary[term] = len(value)

        f = open('dictionary.json', 'w')
        f.write(json.dumps(self.termsDictionary, indent=4, sort_keys=True))
        f.close()

        f = open('posting-list.json', 'w')
        f.write(json.dumps(self.terms, indent=4, sort_keys=True))
        f.close()
Example #15
    def index_document(self, docid, path_physical):
        self.manage_DB.delete_references(docid)

        # Get the information for the supplied document.
        document = self.manage_DB.get_info('document', rowid=docid)

        # Open the document for reading.
        fhandle = open('%s%s' % (path_physical, docid), 'r')
        # Create an instance of the Porter Stemmer.
        PS = PorterStemmer()

        # Get the 1st line of the supplied document and force the contents to
        # lowercase.
        content = fhandle.readline().lower()

        # The text widget starts indexing its lines at 1, but columns start
        # indexing at 0.
        line_count = 1

        # While the supplied document has content to be read.
        while content != '':
            # Find all words from the current line of the supplied document
            # and put them in a list.
            words = re.findall('\w+', content)

            # For each word in the list of words from the current line.
            for word in words:
                # Only words whose length is greater than 3 will be indexed.
                if len(word) > 3:
                    # Check for the word in the list of stop words.
                    res = self.manage_DB.get_info('stop_words', where={
                        'word': word})

                    # If the word does not exist in the list of stop words:
                    if not res:
                        # The column of the current word is its index in the
                        # current line.
                        col_count = content.find(word) + 1
                        # Using the PorterStemmer, find the root of the current
                        # word. Add the root word, with the current line and
                        # column number to the index.
                        self.add_index_word(
                            PS.stem(word, 0, len(word) - 1),
                            docid,
                            line_count,
                            col_count,
                            word)
            # Get the next line of the supplied document and force the
            # contents to lowercase.
            content = fhandle.readline().lower()
            # Increment the line count.
            line_count += 1

        # Close the supplied document file.
        fhandle.close()
        return
Example #16
def stem(tokens):
    """
    receive tokens
    return stemmedTokens
    """
    stemmedTokens = []
    stemmer = PorterStemmer()
    for token in tokens:
        stemmedTokens.append(stemmer.stem(token, 0, len(token) - 1))

    return stemmedTokens
Example #17
def tokenize(inputStr):
	tokenPattern = re.compile(r'[^a-zA-Z0-9.,_]')	
#	tokenPattern = re.compile(r'[\s:?;()\[\]&!*@#$%+<>/\\\'\"]|\.(\.)+|(-)+')
	primordialTokens = re.split(tokenPattern, inputStr)
#	primordialTokens = inputStr.replace(">", " ").replace("...", " ").replace("-"," ").replace("'"," ").replace("/"," ").split(' ')
	stripPuncTokens = [x.strip(',.').replace(",","").lower() for x in primordialTokens if x != None]
	stripPuncTokens = [x for x in stripPuncTokens if x != '' and x not in stop_words]

	#stemming
	p = PorterStemmer()
	stemmedTokens = [p.stem(x, 0, len(x)-1) for x in stripPuncTokens]
	return stemmedTokens
Example #18
def porter(text):
    p = PorterStemmer()
    output = ''
    word = ''
    for c in text:
        if c.isalpha():
            word += c.lower()
        else:
            if word:
                output += p.stem(word, 0, len(word) - 1)
                word = ''
            output += c.lower()
    return output.split()
Example #19
  def add(self, text, fname, stem=False):
    """Add a string of text to the corpus by first splitting it into features
    defined by WORD_PATT, and then removing stop words.

    Takes a string as its argument."""
    for match in re.finditer(self.WORD_PATT, text):
      if match:
        word = match.group(0).lower()
        if word in self.STOPWORDS:
          self.removed.append(word)
          self.words.add_word(word, fname)
          continue
        if stem:
          p = PorterStemmer()
          word = p.stem(word, 0, len(word)-1)
Example #20
def revise_documents(docs, vocab):
    stemmer = PorterStemmer()
    senses = {}     # {reference:sense}
    for ref, text in docs.items():
        words = re.findall(r"[\w']+", text)
        word_list = []
        for w in words:
            if w == "tag":
                continue
            if w.isdigit() and int(w) > 100000:
                senses[ref] = w
                continue
            if w in vocab:
                word_list.append(stemmer.stem(w.lower(), 0, len(w) - 1))
        docs[ref] = word_list
    return docs, senses
Example #21
class WindowPorterStemStringFeature(object):
    def __init__(self):
        self._stemmer = PorterStemmer()

    def get_id(self):
        return 'WINDOW-STEM-STRING'
    
    def _stem(self, token):
        return self._stemmer.stem(token, 0, len(token) - 1)

    def featurise(self, document, sentence, annotation):
        NORMALISE = True

        before_ann = sentence.text[:annotation.start].split()
        before_ann.reverse()
        after_ann = sentence.text[annotation.end:].split()

        to_yield = []
        for i, tok in izip(xrange(1, 4), before_ann):
            to_yield.append((u'-BEFORE-{}-{}'.format(i, self._stem(tok)), 1))
        for i, tok in izip(xrange(1, 4), after_ann):
            to_yield.append((u'-AFTER-{}-{}'.format(i, self._stem(tok)), 1))
        for f_tup in to_yield:
            if NORMALISE:
                yield (f_tup[0], f_tup[1] / float(len(to_yield)))
            else:
                yield f_tup
Example #22
class WindowPorterStemStringFeature(object):
    def __init__(self):
        self._stemmer = PorterStemmer()

    def get_id(self):
        return 'WINDOW-STEM-STRING'

    def _stem(self, token):
        return self._stemmer.stem(token, 0, len(token) - 1)

    def featurise(self, document, sentence, annotation):
        NORMALISE = True

        before_ann = sentence.text[:annotation.start].split()
        before_ann.reverse()
        after_ann = sentence.text[annotation.end:].split()

        to_yield = []
        for i, tok in izip(xrange(1, 4), before_ann):
            to_yield.append((u'-BEFORE-{}-{}'.format(i, self._stem(tok)), 1))
        for i, tok in izip(xrange(1, 4), after_ann):
            to_yield.append((u'-AFTER-{}-{}'.format(i, self._stem(tok)), 1))
        for f_tup in to_yield:
            if NORMALISE:
                yield (f_tup[0], f_tup[1] / float(len(to_yield)))
            else:
                yield f_tup
Example #23
class Pepper(object):
    """
    The Pepper Pots of UI (Public Relations) for Tony Stark. Handles user
    input of queries, parses the queries, and returns results from the
    corpus indexed by Ironman.
    """

    def __init__(self, documents, NDC, stop_words):
        super(Pepper, self).__init__()
        self.documents = documents
        self.NDC = NDC
        self.p = PorterStemmer()
        self.stop_words = stop_words

    def handleQuery(self, user_input):
        """
        Handles the process of formatting a user_inputted query
        """
        scores = []
        stem_query = self.p.stemText(user_input, self.stop_words).encode('utf_8', 'ignore')
        query = Document(stem_query, full_text=user_input)
        self.NDC.normalize(query)
        for document in self.documents:
            scores.append((self.NDC.score(query, document), document))
        scores = sorted(scores, reverse=True)
        return scores

    def score(self, query, document):
        return 1
Example #24
    def __init__(self, db_file, indexer):
        self.db_file = db_file
        self.indexer = indexer

        self.db = sqlite3.connect(db_file)
        self.p = PorterStemmer()
        self.sw = stopwords.StopWords(self.stopword_file)

        self.re_tag = RE_TAG
Example #25
    def score_query(self, query, word_matrix, normalized_matrix,
                    stop_words_list, title_vocabulary_dict):
        porter_stemmer = PorterStemmer()
        square_sum = 0
        words = {}

        for word in query:
            word_without_punctuation = word.strip(string.punctuation).replace(
                " ", "").lower()
            if word_without_punctuation not in stop_words_list:
                stemmed_word = porter_stemmer.stem(
                    word_without_punctuation, 0,
                    len(word_without_punctuation) - 1)
                if stemmed_word not in words:
                    words[stemmed_word] = {}
                    words[stemmed_word]['repetitions'] = 0
                words[stemmed_word]['repetitions'] += 1

        for word, elements in words.items():
            square_sum += math.pow(elements['repetitions'], 2)
        for word, elements in words.items():
            if word in word_matrix:
                words[word]['normalized'] = words[word][
                    'repetitions'] / math.sqrt(square_sum)
                words[word]['weight'] = words[word][
                    'normalized'] * word_matrix[word]['idf']
            else:
                words[word]['normalized'] = 0
                words[word]['weight'] = 0
        aggregate_scores = {}
        title_addition_performed = []
        for word, elements in words.items():
            if word in normalized_matrix:
                for doc_id, doc_weight in normalized_matrix[word].items():
                    if doc_id not in aggregate_scores:
                        aggregate_scores[doc_id] = 0
                    aggregate_scores[doc_id] += doc_weight * elements['weight']
                    if word in title_vocabulary_dict:
                        if doc_id in title_vocabulary_dict[
                                word] and doc_id not in title_addition_performed:
                            aggregate_scores[doc_id] += 0.5
                            title_addition_performed.append(doc_id)
        return aggregate_scores
Example #26
    def load(self, ndxfile):
        with open(ndxfile, "r") as f:
            self.ndx = json.loads(f.read())
        self.p = PorterStemmer()
        self.sw = stopwords.StopWords(self.stopword_file)

        for w, val in self.ndx.items():
            for d in val:
                self.dim_map[d] = len(self.dims) - 1
                self.dims.add(d)
Example #27
    def search(self, word):
        # Create an instance of the Porter Stemmer.
        PS = PorterStemmer()

        # Get the information for the supplied word.
        res = self.manage_DB.get_index_word_info(
            PS.stem(word, 0, len(word) - 1))

        # The supplied word exists in the index_word table.
        if res:
            # Extract the id for the supplied word.
            wordid = res['id']

            # Return the found entries as a list.
            res = []

            # Query the index_ref table for all the entries whose wordid
            # match the supplied word's id.
            self.c.execute("""select * from index_ref where wordid=?""",
                (wordid,))

            # Retrieve all the results of the query as a list.
            entries = self.c.fetchall()

            # For every entry in the list.
            for row in entries:
                # Create a dictionary with the results and add the dictionary
                # to the list.
                res.append({
                    'id': row[0],
                    'word': self.manage_DB.get_index_word_info(row[1])['word'],
                    'docid': row[2],
                    'doc': self.manage_DB.get_document_info(row[2])['name'],
                    'line': row[3], 'column': row[4],
                    'branch_word': row[5]})

            # Return the list of all the results.
            return res
        # The supplied word does not exist in the index_word table, so return
        # an empty list.
        else:
            return []
Example #28
def tokenize_on_porter(text):
    word_list = []
    p = PorterStemmer()
    outfile = open('out3', 'w')
    for line in text.splitlines():
        output = ''
        word = ''
        if line != '':
            for c in line:
                if c.isalpha():
                    word += c.lower()
                else:
                    if word:
                        word_stem = p.stem(word, 0, len(word)-1)
                        output += word_stem
                        word_list.append(word_stem)
                        word = ''
                    output += c.lower()
        print(output, end='\n', file=outfile)
    outfile.close()
    return word_list
Example #29
    def make_cloud(self):
        stemdict, tempdict, finaldict = {}, {}, {}
        stopwords = open('stopwords.txt', 'r').read().split('\n')

        # Extract just the words inside quotes
        quotes = ' '.join(self.extract_quotes())
        wordlist = re.split('\s+', quotes.lower())
        
        p = PorterStemmer()
        punctuation = re.compile(r'[.?!,":;-]')

        # Stem all of the words in the word list using the Porter Stemmer
        for w in wordlist:
            w = punctuation.sub('', w)
            s = p.stem(w, 0,len(w)-1)
            try:
                tempdict[w] += 1
            except:
                tempdict[w] = 1
            stemdict.setdefault(s,{}).update({w:tempdict[w]})
        
        cumfreq = 0

        # Calculate the cumulative frequencies of the stemmed words
        for k, v in stemdict.items():
            for l, m in v.items():
                cumfreq = cumfreq + m
            items = v.items()
            items.sort(lambda x, y: cmp(y[1], x[1]))
            finaldict[items[0][0]] = cumfreq
            cumfreq = 0

        # Remove stopwords like "the", "it", "a", etc.
        for word in stopwords:
            try:
                del finaldict[word]
            except: pass

        results = self.process_cloud(8, finaldict.items()[:50])
        return results
Example #30
    def search(self, word, docid=None):
        # Create an instance of the Porter Stemmer.
        PS = PorterStemmer()

        # Get the information for the supplied word.
        res = self.manage_DB.get_info('index_word', where={
            'word': PS.stem(word, 0, len(word) - 1)})

        # The supplied word exists in the index_word table.
        if res:
            # Extract the id for the supplied word.
            wordid = res[0]['id']

            if docid:
                # Get all the entries in the index reference database that refer to
                # the supplied wordid.
                res = self.manage_DB.get_info('index_ref', where={
                    'wordid': wordid, 'docid': docid})
            else:
                # Get all the entries in the index reference database that refer to
                # the supplied wordid.
                res = self.manage_DB.get_info('index_ref', where={
                    'wordid': wordid})

            # For every entry in the list.
            for row in res:
                # Modify the current row to contain the stem word.
                row['word'] =  self.manage_DB.get_info(
                    'index_word', rowid=row['wordid'])['word']
                # Modify the current row to contain the document name.
                row['doc'] = self.manage_DB.get_info(
                    'document', rowid=row['docid'])['name']

            # Return the list of all the results.
            return res
        # The supplied word does not exist in the index_word table, so return
        # an empty list.
        else:
            return []
Example #31
class SentencePorterStemStringFeature(object):
    def __init__(self):
        self._stemmer = PorterStemmer()

    def _stem(self, token):
        return self._stemmer.stem(token, 0, len(token) - 1)

    def get_id(self):
        return 'STRING-SENTENCE-STEM'

    def featurise(self, document, sentence, annotation):
        for token in sentence.text.split():
            yield (self._stem(token), 1)
Example #32
class SentencePorterStemStringFeature(object):
    def __init__(self):
        self._stemmer = PorterStemmer()

    def _stem(self, token):
        return self._stemmer.stem(token, 0, len(token) - 1)

    def get_id(self):
        return 'STRING-SENTENCE-STEM'

    def featurise(self, document, sentence, annotation):
        for token in sentence.text.split():
            yield (self._stem(token), 1)
Example #33
class StringPorterStemFeature(object):
    def __init__(self):
        self._stemmer = PorterStemmer()

    def _stem(self, token):
        return self._stemmer.stem(token, 0, len(token) - 1)

    def get_id(self):
        return 'STRING-STEM'

    def featurise(self, document, sentence, annotation):
        yield (self._stem(sentence.annotation_text(annotation)), 1)
Example #34
    def __init__(self):
        '''
        if phrase_dict_json != None: extract the phrase features
        if subtype_flag = True, extract the features by sub parse_type
        if bioe_flag = True, use the BIOE tags
        '''
        self.features = [
            'pos', 'chunk', 'promptword', 'stopword', 'tf', 'rank', 'color'
        ]

        if 'pos' in self.features:
            self.pos_tagger = SennaTagger(global_params.sennadir)

        if 'chunk' in self.features:
            self.chunk_tagger = SennaChunkTagger(global_params.sennadir)

        self.sentences = []

        self.porter = PorterStemmer()

        self.token_dict = None
        self.bins = 50
Example #35
class StringPorterStemFeature(object):
    def __init__(self):
        self._stemmer = PorterStemmer()
        
    def _stem(self, token):
        return self._stemmer.stem(token, 0, len(token) - 1)

    def get_id(self):
        return 'STRING-STEM'

    def featurise(self, document, sentence, annotation):
        yield (self._stem(sentence.annotation_text(annotation)), 1)
Example #36
def getDocStuff(dDocProps):
    lAllLists = []

    if (constants.T in dDocProps):
        lAllLists.append(dDocProps[constants.T])
        putinDPLace("1",dDocProps[constants.T])
    if (constants.W in dDocProps):
        lAllLists.append(dDocProps[constants.W])
        putinDPLace("2",dDocProps[constants.W])
    if (constants.A in dDocProps):
        lAllLists.append(dDocProps[constants.A])
        putinDPLace("3",dDocProps[constants.A])

    lAllLines = []
    for lList in lAllLists:
        lAllLines.extend(lList)
    
    lAllWords = []
    for sLine in lAllLines:
        sLine = re.sub('[^a-zA-Z0-9]', ' ', sLine)
        lWords = sLine.lower().split()
        lAllWords.extend(lWords)
    lw = copy.deepcopy(lAllWords)
    lAllWords = helperFunctions.remStopWords(lAllWords)

    p = PorterStemmer()
    lAllWordsStemmed = []
    for word in lAllWords:
        word = p.stem(word,0,len(word)-1)
        lAllWordsStemmed.append(word)

    lUniqueWords = list(set(lAllWordsStemmed))
    lenAllWords = len(lAllWordsStemmed)
    constants.allDocsLen = constants.allDocsLen+lenAllWords
    sRet = helperFunctions.makeFixedLengthStr(len(lAllWordsStemmed),constants.docWordCntLen)+constants.space+helperFunctions.makeFixedLengthStr(len(lUniqueWords),constants.docWordCntLen)+constants.newLine

    return [sRet,lAllWordsStemmed," ".join(lw)]
Example #37
class Parser(object):
    """
    The parsing workhorse of the entire project.
    """

    def __init__(self, stop_words, **kwargs):
        """
        The constructor for the Parser object.

        @stop_words: either a list of stop words, or None
        """
        super(Parser, self).__init__()
        # Checks if stop_words is a list
        if stop_words is not None:
            self.stop_words = []
            for word in stop_words:
                self.stop_words.append(word.lower())
        else:
            self.stop_words = None
        self.hashes = []
        self.documents = []
        self.num_duplicates = 0
        self.p = PorterStemmer()

    def retrieveText(self, page_soup, url):
        """
        Retrieves all the non-markup text from a webpage that
        has already been crawled.

        @page_soup: The soupified version of a webpage
        """
        # Retrieve all the text of the page minus the html tags
        page_text = page_soup.get_text()
        # Stems and returns all the non-stopword text
        stem_text = self.p.stemText(page_text, self.stop_words).encode('utf_8', 'ignore')
        # Create a hash to make sure there are no 100% duplicates in the pages
        # The hex digest will also be used as the document ID, since they will
        # be unique unless they are a duplicate
        h = hashlib.md5()
        h.update(stem_text)
        page_hash = h.hexdigest()
        # If the page is not a duplicate, add the hash to a list of found
        # hashes, and create a Document object to keep track of the information
        # for each Document
        if page_hash not in self.hashes:
            self.hashes.append(page_hash)
            self.documents.append(Document(stem_text, page_text, url, page_hash))
        else:
            self.num_duplicates += 1
Example #38
def getDocStuff(dDocProps):
    global T,W,B,A,N,I
    lAllLists = []
    if (T in dDocProps):
        lAllLists.append(dDocProps[T])
    if (W in dDocProps):
        lAllLists.append(dDocProps[W])
    #if (B in dDocProps):
    #    lAllLists.append(dDocProps[B])
    if (A in dDocProps):
        lAllLists.append(dDocProps[A])
    #if (N in dDocProps):
    #    lAllLists.append(dDocProps[N])

    lAllLines = []
    for lList in lAllLists:
        lAllLines.extend(lList)
    
    lAllWords = []
    for sLine in lAllLines:
        lWords = sLine.split()
        lAllWords.extend(lWords)

    lAllWords = helperFunctions.remStopWords(lAllWords)

    p = PorterStemmer()
    lAllWordsStemmed = []
    for word in lAllWords:
        word = p.stem(word,0,len(word)-1)
        lAllWordsStemmed.append(word)
    #print("All words :", lAllWordsStemmed,"\n")
    lUniqueWords = list(set(lAllWordsStemmed))
    lenAllWords = len(lAllWordsStemmed)
    sRet = makeFixedLengthStr(len(lAllWordsStemmed),6)+" "+makeFixedLengthStr(len(lUniqueWords),6) #+":"+dDocProps[B][0]
    return [sRet,lAllWordsStemmed]
Example #39
    def process_query(self, query):
        all_doc_count = len(self.invert.documents.keys())
        query_array = [x.lower() for x in query.split(' ')]
        query_weights = {}
        stopwords = []
        if self.stopword_toggle:
            stopwords = fetch_stopwords()
        while query_array:
            word = query_array.pop(0)
            frequency = 1

            for a in [',', '.', '{', '}', '(', ')', ';', ':', '"', '\'']:
                if a in word:
                    if word.index(a) == 0 or word.index(a) == len(word) - 1:
                        word = word.replace(a, '')

            while word in query_array:
                query_array.pop(query_array.index(word))
                frequency += 1

            if self.stemming_toggle:
                p = PorterStemmer()
                word = p.stem(word, 0, len(word) - 1)

            if word in stopwords:
                continue

            term_weight = 0
            if word in self.invert.termsDictionary.keys():
                document_frequency = self.invert.termsDictionary[word]
                idf = math.log(all_doc_count / document_frequency)
                term_frequency = 1 + math.log(frequency)
                term_weight = idf * term_frequency

            query_weights[word] = term_weight
        return query_weights
Example #40
def main(argv):
    files = os.listdir(sys.argv[1])
    file = files[0]
    stemmed = []
    for file in files:
        text = ""
        infile = open(sys.argv[1] + file)
        a = infile.readline()
        while a:
            text += removeSGML(a)
            a = infile.readline()
        tok = tokenizeText(text)
        removed = removeStopwords(tok)
        from porter import PorterStemmer
        p = PorterStemmer()
        for element in removed:
            stemmed.append(p.stem(element, 0, len(element)-1))
    print "Words " + str(len(stemmed))
    unique = list(set(stemmed))
    print "Vocabulary " + str(len(unique))
    wordfrequency = [(unique[x], stemmed.count(unique[x])) for x in range(0,len(unique))]
    sort = sorted(wordfrequency, key = getKey, reverse = True)
    for i in range(0,49):
        print sort[i]
Example #41
    def __init__(self, stop_words, **kwargs):
        """
        The constructor for the Parser object.

        @stop_words: either a list of stop words, or None
        """
        super(Parser, self).__init__()
        # Checks if stop_words is a list
        if stop_words is not None:
            self.stop_words = []
            for word in stop_words:
                self.stop_words.append(word.lower())
        else:
            self.stop_words = None
        self.hashes = []
        self.documents = []
        self.num_duplicates = 0
        self.p = PorterStemmer()
Example #42
def initBooleanQuery():
	#start_time = time.time()
	term2tidFile = open("term2tid.json", "r")
	indexFile = open("invertedIndex.json", "r")

	global term2id
	term2id = json.load(term2tidFile)
	global invertedIndex
	invertedIndex = json.load(indexFile)
	#print("--- %s seconds ---" % (time.time() - start_time))
	
	term2tidFile.close()
	indexFile.close()

	global wholeList
	wholeList = range(utils.docCount)
	global wholeLen
	wholeLen = utils.docCount
	global p
	p = PorterStemmer()
Example #43
 def __init__(self):
     '''
     if phrase_dict_json != None: extract the phrase features
     if subtype_flag = True, extract the features by sub parse_type
     if bioe_flag = True, use the BIOE tags
     '''
     self.features = ['pos', 'chunk', 'promptword', 'stopword', 'tf', 'rank', 'color']
     
     if 'pos' in self.features:
         self.pos_tagger = SennaTagger(global_params.sennadir)
     
     if 'chunk' in self.features:
         self.chunk_tagger = SennaChunkTagger(global_params.sennadir)
     
     self.sentences = []
     
     self.porter = PorterStemmer()
     
     self.token_dict = None
     self.bins = 50
Example #44
    def build(self, docpath, outfile):
        p = PorterStemmer()
        sw = stopwords.StopWords(self.stopword_file)

        ndx = defaultdict(list)

        for filename in os.listdir(docpath):
            if not filename.endswith(".txt"): continue

            doc_id = hash(filename.replace(".txt", ""))
            with open(os.path.join(docpath, filename)) as f:
                f_content = kwutils.normalize(f.read().lower())

            words = kwutils.tokenize(f_content)
            w_stemmed = kwutils.stem(words, p)
            w_stopped = kwutils.filter_stopwords(w_stemmed, sw)

            for word in w_stopped:
                if len(word) > 0:
                    if not doc_id in ndx[word]:
                        ndx[word].append(doc_id)

        with open(outfile, 'w') as f:
            f.write(json.dumps(ndx))
Example #45
class CRF_Extractor:
    '''
    extract features for the CRF model
    each line is a feature vector for a token
    '''
    def __init__(self):
        '''
        if phrase_dict_json != None: extract the phrase features
        if subtype_flag = True, extract the features by sub parse_type
        if bioe_flag = True, use the BIOE tags
        '''
        self.features = ['pos', 'chunk', 'promptword', 'stopword', 'tf', 'rank', 'color']
        
        if 'pos' in self.features:
            self.pos_tagger = SennaTagger(global_params.sennadir)
        
        if 'chunk' in self.features:
            self.chunk_tagger = SennaChunkTagger(global_params.sennadir)
        
        self.sentences = []
        
        self.porter = PorterStemmer()
        
        self.token_dict = None
        self.bins = 50
    
    def add_sentence(self, sentence):
        self.sentences.append(sentence)
    
    def get_token_tf(self):
        self.token_dict = defaultdict(float)
        
        for tokens, _, _ in self.sentences:
            for token in self.porter.stem_tokens(tokens):
                self.token_dict[token] += 1.0
        
        self.rank_dict = defaultdict(int)
        rank_tokens = sorted(self.token_dict, key=self.token_dict.get, reverse=True)
        
        self.rank_dict = defaultdict(int)
        for i, token in enumerate(rank_tokens):
            self.rank_dict[token] = int(i*10/len(rank_tokens))
        
        for t, v in self.token_dict.items(): #normalized by the number of sentences
            x = v/len(self.sentences)
            if x > 1.0: x = 1.0
            
            self.token_dict[t] = x
        
    def get_feature_names(self):
        return '_'.join(self.features)
    
    def get_i_j(self, body, i, j):
        '''
        return the value of the crf template feature u[i, j]
        input:
            body: [][], two-dimensional array, representing the crf features for a sentence
            i: int, the index of i
            j: int, the index of j
        '''
        n = len(body)
        if i < 0:
            v = '_x%d'%(i)
        elif i >= n:
            v = '_x+%d'%(i-n+1)
        else:
            v = body[i][j]
        return v
    
    def extract_U_i_j(self, data_body, feature_body, i, j, tag):
        '''
        extract the U[i, j] feature, and add it to the end of each row
        input:
            data_body: [][], two-dimensional array, representing the crf data for a sentence
            feature_body: [][], two-dimensional array, the resulting feature data for a sentence
            i: int, the index of i
            j: int, the index of j
            tag: the prefix of the feature name
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s'%(tag, self.get_i_j(data_body, k+i, j)))
    
    def extract_U_i_j_m_n(self, data_body, feature_body, i, j, m, n, tag):
        '''
        extract the U[i, j]/U[m, n] feature, and add it to the end of each row
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s/%s'%(tag, self.get_i_j(data_body, k+i, j), self.get_i_j(data_body, k+m, n)))
    
    def extract_U_i_j_m_n_x_y(self, data_body, feature_body, i, j, m, n, x, y, tag):
        '''
        extract the U[i, j]/U[m, n]/U[x,y] feature, and add it to the end of each row
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s/%s/%s'%(tag, self.get_i_j(data_body, k+i, j), self.get_i_j(data_body, k+m, n), self.get_i_j(data_body, k+x, y)))
    
    def extract_word_U_i_j(self, data_body, index, feature_body, i, j, tag):
        '''
        extract the U[i, j] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s'%(tag, self.get_i_j(data_body, k+i, j)))
    
    def extract_word_U_i_j_m_n(self, data_body, index, feature_body, i, j, m, n, tag):
        '''
        extract the U[i, j]/U[m, n] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s/%s'%(tag, self.get_i_j(data_body, k+i, j), self.get_i_j(data_body, k+m, n)))
    
    def extract_word_U_i_j_m_n_x_y(self, data_body, index, feature_body, i, j, m, n, x, y, tag):
        '''
        extract the U[i, j]/U[m, n] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s/%s/%s'%(tag, self.get_i_j(data_body, k+i, j), self.get_i_j(data_body, k+m, n), self.get_i_j(data_body, k+x, y)))
            
    def extract_bigram(self, body):
        '''
        extract the bigram feature for the crf template
        '''
        for row in body:
            row.append('b')
    
    def extract_crf_features(self, tokens, tags, prompt, colors=None):
        '''
        Extract the character features, each token a line
        return: [][], two-dimensional array, representing the feature data of the sentence
        '''
    
        body = []

        words = tokens
        N = len(tokens)
        
        #first row: the word token
        for word in words:
            row = []
            row.append(word)
            body.append(row)
        
        if 'pos' in self.features:
            pos_tags = self.pos_tagger.tag(tokens)
            
            for i, (_, p_tag) in enumerate(pos_tags):
                body[i].append(p_tag)
        
        if 'chunk' in self.features:
            chunk_tags = self.chunk_tagger.tag(tokens)
            
            for i, (_, c_tag) in enumerate(chunk_tags):
                body[i].append(c_tag)
        
        if 'promptword' in self.features:
            for i, token in enumerate(tokens):
                if token in prompt_words[prompt]:
                    body[i].append('Y')
                else:
                    body[i].append('N')
        
        if 'stopword' in self.features:
            for i, token in enumerate(tokens):
                if token in stopwords:
                    body[i].append('Y')
                else:
                    body[i].append('N')
        
        if 'tf' in self.features:
            if self.token_dict == None:
                self.get_token_tf()
            
            for i, token in enumerate(self.porter.stem_tokens(tokens)):
                assert(token in self.token_dict)
                
                x = int(self.token_dict[token]*self.bins)
                body[i].append(str(x))
        
        if 'rank' in self.features:
            if self.rank_dict == None:
                self.get_token_tf()
            
            for i, token in enumerate(self.porter.stem_tokens(tokens)):
                assert(token in self.rank_dict)
                
                x = self.rank_dict[token]
                body[i].append(str(x))        
        
        if 'color' in self.features and colors != None:
            for color in colors:
                for i, tag in enumerate(tags):
                    body[i].append(str(color[i]))
        
        #last row:
        tags = [tag for tag in tags]
        
        for i, tag in enumerate(tags):
            body[i].append(tag)
        
        return body
Example #46
poslineedited = []
neglinesedited = []


# there are 6397 positives and negatives in total.
poslinesTrain= poslines[:3201]
neglinesTrain= neglines[:3196]

priorknowledgepo = []
priorknowledgeneg = []

priorknowledgeneg= 3196/ 6397
priorknowledgepo = 3201/ 6397


stemmer = PorterStemmer()
model = open('F:/ifa/NaiveBayes/model_file.csv', 'w',encoding="utf8")


trainset= [(x,1) for x in poslinesTrain] + [(x,-1) for x in neglinesTrain]
poswords={} #this dictionary stores counts for every word in positives
negwords={} #and negatives

for line,label in trainset: 
    words= getwords(line)

    for word in words:   
        word = word.lower()
        #increment the counts for this word based on the label
        #the .get(x, 0) method returns the current count for word 
        #x, of 0 if the word is not yet in the dictionary
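
The example is cut off at the counting step; a minimal sketch of the increment the comments describe, assuming the poswords/negwords dictionaries and the +1/-1 labels defined above (not the original author's code):

        # increment the per-class count for this word; .get(word, 0) returns
        # the current count, or 0 if the word is not yet in the dictionary
        if label == 1:
            poswords[word] = poswords.get(word, 0) + 1
        else:
            negwords[word] = negwords.get(word, 0) + 1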
Example #47
class CRF_Extractor:
    '''
    extract features for the CRF model
    each line is a feature vector for a token
    '''
    def __init__(self):
        '''
        if phrase_dict_json != None: extract the phrase features
        if subtype_flag = True, extract the features by sub parse_type
        if bioe_flag = True, use the BIOE tags
        '''
        self.features = [
            'pos', 'chunk', 'promptword', 'stopword', 'tf', 'rank', 'color'
        ]

        if 'pos' in self.features:
            self.pos_tagger = SennaTagger(global_params.sennadir)

        if 'chunk' in self.features:
            self.chunk_tagger = SennaChunkTagger(global_params.sennadir)

        self.sentences = []

        self.porter = PorterStemmer()

        self.token_dict = None
        self.bins = 50

    def add_sentence(self, sentence):
        self.sentences.append(sentence)

    def get_token_tf(self):
        self.token_dict = defaultdict(float)

        for tokens, _, _ in self.sentences:
            for token in self.porter.stem_tokens(tokens):
                self.token_dict[token] += 1.0

        self.rank_dict = defaultdict(int)
        rank_tokens = sorted(self.token_dict,
                             key=self.token_dict.get,
                             reverse=True)

        self.rank_dict = defaultdict(int)
        for i, token in enumerate(rank_tokens):
            self.rank_dict[token] = int(i * 10 / len(rank_tokens))

        for t, v in self.token_dict.items(
        ):  #normalized by the number of sentences
            x = v / len(self.sentences)
            if x > 1.0: x = 1.0

            self.token_dict[t] = x

    def get_feature_names(self):
        return '_'.join(self.features)

    def get_i_j(self, body, i, j):
        '''
        return the value of the crf template feature u[i, j]
        input:
            body: [][], two-dimensional array, representing the crf features for a sentence
            i: int, the index of i
            j: int, the index of j
        '''
        n = len(body)
        if i < 0:
            v = '_x%d' % (i)
        elif i >= n:
            v = '_x+%d' % (i - n + 1)
        else:
            v = body[i][j]
        return v

    def extract_U_i_j(self, data_body, feature_body, i, j, tag):
        '''
        extract the U[i, j] feature, and add it to the end of each row
        input:
            data_body: [][], two-dimensional array, representing the crf data for a sentence
            feature_body: [][], two-dimensional array, the resulting feature data for a sentence
            i: int, the index of i
            j: int, the index of j
            tag: the prefix of the feature name
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s' % (tag, self.get_i_j(data_body, k + i, j)))

    def extract_U_i_j_m_n(self, data_body, feature_body, i, j, m, n, tag):
        '''
        extract the U[i, j]/U[m, n] feature, and add it to the end of each row
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s/%s' % (tag, self.get_i_j(
                data_body, k + i, j), self.get_i_j(data_body, k + m, n)))

    def extract_U_i_j_m_n_x_y(self, data_body, feature_body, i, j, m, n, x, y,
                              tag):
        '''
        extract the U[i, j]/U[m, n]/U[x,y] feature, and add it to the end of each row
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s/%s/%s' % (tag, self.get_i_j(
                data_body, k + i, j), self.get_i_j(
                    data_body, k + m, n), self.get_i_j(data_body, k + x, y)))

    def extract_word_U_i_j(self, data_body, index, feature_body, i, j, tag):
        '''
        extract the U[i, j] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s' % (tag, self.get_i_j(data_body, k + i, j)))

    def extract_word_U_i_j_m_n(self, data_body, index, feature_body, i, j, m,
                               n, tag):
        '''
        extract the U[i, j]/U[m, n] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s/%s' % (tag, self.get_i_j(
                data_body, k + i, j), self.get_i_j(data_body, k + m, n)))

    def extract_word_U_i_j_m_n_x_y(self, data_body, index, feature_body, i, j,
                                   m, n, x, y, tag):
        '''
        extract the U[i, j]/U[m, n] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s/%s/%s' % (tag, self.get_i_j(
                data_body, k + i, j), self.get_i_j(
                    data_body, k + m, n), self.get_i_j(data_body, k + x, y)))

    def extract_bigram(self, body):
        '''
        extract the bigram feature for the crf template
        '''
        for row in body:
            row.append('b')

    def extract_crf_features(self, tokens, tags, prompt, colors=None):
        '''
        Extract the character features, each token a line
        return: [][], two-dimensional array, representing the feature data of the sentence
        '''

        body = []

        words = tokens
        N = len(tokens)

        #first row: the word token
        for word in words:
            row = []
            row.append(word)
            body.append(row)

        if 'pos' in self.features:
            pos_tags = self.pos_tagger.tag(tokens)

            for i, (_, p_tag) in enumerate(pos_tags):
                body[i].append(p_tag)

        if 'chunk' in self.features:
            chunk_tags = self.chunk_tagger.tag(tokens)

            for i, (_, c_tag) in enumerate(chunk_tags):
                body[i].append(c_tag)

        if 'promptword' in self.features:
            for i, token in enumerate(tokens):
                if token in prompt_words[prompt]:
                    body[i].append('Y')
                else:
                    body[i].append('N')

        if 'stopword' in self.features:
            for i, token in enumerate(tokens):
                if token in stopwords:
                    body[i].append('Y')
                else:
                    body[i].append('N')

        if 'tf' in self.features:
            if self.token_dict == None:
                self.get_token_tf()

            for i, token in enumerate(self.porter.stem_tokens(tokens)):
                assert (token in self.token_dict)

                x = int(self.token_dict[token] * self.bins)
                body[i].append(str(x))

        if 'rank' in self.features:
            if self.rank_dict == None:
                self.get_token_tf()

            for i, token in enumerate(self.porter.stem_tokens(tokens)):
                assert (token in self.rank_dict)

                x = self.rank_dict[token]
                body[i].append(str(x))

        if 'color' in self.features and colors != None:
            for color in colors:
                for i, tag in enumerate(tags):
                    body[i].append(str(color[i]))

        #last row:
        tags = [tag for tag in tags]

        for i, tag in enumerate(tags):
            body[i].append(tag)

        return body
Example #48
class BuildTermSpace(object):

	"""
	Builds a JSON object (a dictionary) that stores the stems of
	content words and their frequencies from the given corpora.
	"""

	def __init__(self, language='en', action='tfidf'):

		# Call LoadExternalLists, build the stop-word list,
		# and load the German lexicon.
		self.language = language
		self.action = action

		# characters to be stripped from the beginning and end of a token
		self.punctuation = "∙!‼¡\"#£€$¥%&'()*+±×÷·,-./:;<=>?¿@[\]^ˆ¨_`—–­{|}~≈≠→↓¬’“”«»≫‘…¦›🌼′″¹§¼⅜½¾⅘©✒•►●★❤➡➜➚➘➔✔➓➒➑➐➏➎➍➌➋➊❸❷■†✝✌️³‎²‚„ ​"

		loadRes = LoadExternalLists()

		if self.language == 'de':
			self.stopwords = loadRes.loadStopWordsDE()
			# stemmer object
			self.stemmer = GermanStemmer()
			# German dictionary
			print '\n', "Loading German Dictionary... OK", '\n'
			self.lexicon_de = loadRes.loadLexiconDe()
			self.normalizer = NormalizerDE()
		elif self.language == 'ru':
			self.stopwords = loadRes.loadStopWordsRU()
			self.stemmer = RussianStemmer()
			# pymorphy2.MorphAnalyzer() object; we will use its normal_form attribute
			self.lemmatizer_ru = pymorphy2.MorphAnalyzer()
			self.normalizer = NormalizerRU()
		else:
			self.stopwords = loadRes.loadStopWordsEN()
			self.stemmer = PorterStemmer()
			self.normalizer = NormalizerEN()
			# list of irregular verbs
			self.irreg_verbs = loadRes.loadVerbForms()
			# list of irregular nouns
			self.irreg_nouns = loadRes.loadNounforms()
	

	def processString(self, line):
		"""
		Processes each word in sequence. Takes a string as input, builds the tokens list
		from the words extracted by re.split, stripping punctuation from both ends of each
		word and lowercasing it, and removing contraction endings with del_contractions.
		It then builds rslt_list, the list of stemmed terms, dropping stop words
		and digit sequences along the way.
		Returns rslt_list, which contains only the stems of content words.
		"""

		# for splitting into tokens on whitespace and slashes
		splitchars = re.compile(r'[\s\\\/\(\)\[\]\<\>\;\:\,\‚\—\?\!\|\"«»…#]|\.\.\.+|[ �⌂ ∞½¾►=]|\-\-|\.[\'\"’“”«»‘′″„-]') # [\.\:][\'\"’“”«»‘′″]

		# for ignoring tokens that contain digits
		esc_num = re.compile(r'[0-9]+')

		# for ignoring URLs
		#url_esc = re.compile(r'([a-z]{3,6}:\/\/)?([a-zA-Z0-9\-@?]+[\.|\:])+[a-z]{2,13}[\.\?\=\&\%\,\#\+\(\)\/\w\-]*')

				
		if self.language == 'de':
			tokens = (self.normalizer.normalizeUmlaut(self.normalizer.deleteContrs(token.strip(self.punctuation).lower())) for token in splitchars.split(line))
			rslt_list = (self.stemmer.stem(self.normalizer.lemmatize(term, self.lexicon_de)) for term in tokens if term not in self.stopwords and not esc_num.search(term) and len(term)>0)	# and not esc_num.search(term) - enable after the stop-word condition if tokens with digits need to be removed

		elif self.language == 'ru':
			tokens = (self.normalizer.normalizeE(token.strip(self.punctuation).lower()) for token in splitchars.split(line))
			rslt_list = (self.stemmer.stem(self.lemmatizer_ru.parse(term)[0].normal_form) for term in tokens if term not in self.stopwords and not esc_num.search(term) and len(term)>0)	# and not esc_num.search(term) - enable after the stop-word condition if tokens with digits need to be removed

		else:
			# token list generator: in a loop: 1. split the line into tokens with the splitchars regexp,
			# 2. strip the characters around each token and lowercase it,
			# 3. transform irregular verb forms into the regular form,
			# 4. remove contraction endings with \'
			tokens = (self.normalizer.token_transform(self.normalizer.del_contractions(token.strip(self.punctuation).lower()), self.irreg_verbs, self.irreg_nouns) for token in splitchars.split(line))

			# term list generator: if a term is not in the stop-word list and contains no digits, stem it.
			rslt_list = (self.stemmer.stem(term, 0, len(term)-1) for term in tokens if term not in self.stopwords and not esc_num.search(term) and len(term)>0)	# and not esc_num.search(term) - enable after the stop-word condition if tokens with digits need to be removed
		

		if not rslt_list:
			return []

		else:
			return rslt_list


	def processFile(self, filename):
		"""
		Reads the file as UTF-16 and calls processString for each line of the file,
		adding every word from the resulting list to the set terms_set, which
		eliminates duplicates.
		Returns the list of unique lemmas.
		"""

		terms_set = set()
		terms_list = []

		if self.action == 'tfidf':
			try:
				with codecs.open(filename, 'r', 'utf-16') as infile:
					
					for line in infile:
						if len(line) > 1:
							for term in self.processString(line):
								terms_set.add(term)
								
			except (UnicodeDecodeError, UnicodeError, IOError):
				pass

			return terms_set

		if self.action == 'raw':
			try:
				with codecs.open(filename, 'r', 'utf-16') as infile:
					
					for line in infile:
						if len(line) > 1:
							for term in self.processString(line):
								terms_list.append(term)
								
			except (UnicodeDecodeError, UnicodeError, IOError):
				pass

			return terms_list


	def crawl(self, dirname):
		"""
		Walks the folders and subfolders of the directory given as the argument.
		If a file is a text file, it runs processFile and collects the result
		into the set terms_set.
		The overall terms_dict counts the frequency of each lemma, and the dictionary is saved as JSON.
		terms_dict essentially reflects the second part of the tf-idf formula, i.e. it shows
		in how many documents a term occurred.
		"""

		docs_num = 0

		terms_dict = defaultdict(int)
		
		for root, dirs, files in os.walk(dirname):

			print root, "processing..."
			
			for filename in files:

				if filename.endswith('.txt') or filename.endswith('.TXT'):
					
					print filename

					terms_set = self.processFile(join(root,filename))

					for term in terms_set:

						terms_dict[term] += 1

					docs_num+=1
					
		if self.action == 'raw':
			with codecs.open(r'.\termSpace\\'+self.language.upper()+'frequency_list_stem.txt', 'w', 'utf-16') as outfile:
				for key, value in sorted(terms_dict.iteritems(), key=lambda x:x[1], reverse=True):
					outfile.write(key+'\t'+str(value))
					outfile.write('\n')
		
		if self.action == 'tfidf':

			with open(r".\termSpace\\" + self.language.upper() + "CorpusDict_" + str(docs_num) + ".json", 'w') as  outfile:
				json.dump(terms_dict, outfile)
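(A minimal sketch, not part of the scraped example above: assuming crawl() has already written a corpus dictionary such as .\termSpace\ENCorpusDict_1000.json — both the file name and the document count 1000 are hypothetical — it shows how the stored document frequencies could be turned into idf weights.)

import json
import math

def load_idf(corpus_dict_path, docs_num):
    # df maps a stemmed term to the number of documents it occurred in
    with open(corpus_dict_path) as infile:
        df = json.load(infile)
    # idf = log(N / df); the +1 in the denominator guards against division by zero
    return {term: math.log(float(docs_num) / (count + 1)) for term, count in df.items()}

idf = load_idf(r'.\termSpace\ENCorpusDict_1000.json', 1000)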
Example #49
0
# MIT Licensed
# Copyright 2014 REM <*****@*****.**>.

from pymongo import MongoClient
from pymongo import DESCENDING
import utility
from porter import PorterStemmer

p = PorterStemmer()

client = MongoClient('localhost', 27017)
db = client.uberly
clt = db.uber_vocab_1

#db.uber_dictionary.find().limit(50).sort({value:-1}).pretty()
for entry in clt.find().sort([('value', DESCENDING)]):
  entry['stem'] = p.stem(entry['_id'], 0,len(entry['_id'])-1)
  clt.save(entry)
  print entry['_id'], entry['stem']
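(A small follow-up sketch, not part of the original snippet: once every entry carries the 'stem' field written above, related surface forms can be grouped by querying on it; the example stem 'run' is hypothetical.)

# all vocabulary entries that share the stem 'run', most frequent first
for entry in clt.find({'stem': 'run'}).sort([('value', DESCENDING)]):
    print entry['_id'], entry['value']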
  
Example #50
0
from porter import PorterStemmer


def stem(word):
    p = PorterStemmer()
    return p.stem(word, 0, len(word) - 1)
Example #51
0
import re
from porter import PorterStemmer
p = PorterStemmer()


def lcase(text):
    return text.lower()


def prefixes(text):
    return [text[:3], text[:4], text[:5]]


def suffixes(text):
    return [text[-3:], text[-4:], text[-5:]]


def stem(text):
    if text.isalpha():
        return p.stem(text.lower(), 0, len(text) - 1)
    return text


def is_pair_of_digits(text):
    if re.match("^[0-9]{2}$", text):
        return True
    return False


def is_four_digits(text):
    if re.match("^[0-9]{4}$", text):
        return True
    return False
Example #52
0
                if(lData[0] == query):
                    return lData[1:]
                elif (query < lData[0]):
                    print (query+ " < "+lData[0])
                    return search(query,index+1,middle,m1)
                elif (query > lData[0]):
                    print (query+ " > "+lData[0]+" so go to  index : "+str(index+middle))
                    return search(query,index+middle,middle,m1)
                
                
                else:
                    return []


        print(" size : "+str(int(m1.size()/35)))
        p = PorterStemmer()
        query= sys.argv[1]
        query = p.stem(query,0,len(query)-1)
           
        lBytes = search(query,0,int(m1.size()/35),m1)
        print("lBytes : ",lBytes)

        if(lBytes != None):
            termFile = "./indexes/terms.txt"
            with open(termFile, "r+b") as f:
                # memory-map the file, size 0 means whole file
                map = mmap.mmap(f.fileno(), 0)
                print("Term stuff :  ",map[int(lBytes[0]):int(lBytes[1])])

        else:
            print("Not found")
Example #53
0
 def __init__(self):
     _LanguageSpecificStemmer.__init__(self)
     PorterStemmer.__init__(self)
Example #54
0
 def __init__(self):
     self.p = PorterStemmer()
     self.sw = stopwords.StopWords(self.stopword_file)
     self.re_tag = RE_TAG
     self.index = None
Example #55
0
dictionarySo={}
dictionaryProbComPo={}
weightPo=0
weightSo=0
dictionaryProbIndPo={}
dictionaryProbIndSo={}
removeWord=[]
#read in training data lines from files, and stopwords (useless words)
f=open("training.txt");
v=open("test.txt");
dataFile = open("temp1.txt", "w")
comDataFile = open("com.txt", "w")
testFile = open("output.txt", "w")
stopWord=open("stopwords.txt").read()
stopWord=stopWord.split("\n")
stemmer= PorterStemmer()
countPo=0
countSo=0
trainingSet=f.readlines()
testingSet=v.readlines()
trainingSet=trainingSet
testingSet=testingSet

#initialize the stemmer object for (optional) stemming later
stemmer= PorterStemmer()
stopWord=[stemmer.stem(w,0,len(w)-1) for w in stopWord]  # stem each stopword individually

def getCleanString(string):
        """
                fix the string for best results
		the cleaning involve 
Example #56
0
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t')
print("dataset imported")

import re

import nltk

nltk.download('stopwords')

# to remove stopword
from nltk.corpus import stopwords

# for Stemming propose
#from nltk.stem.porter import PorterStemmer
from porter import PorterStemmer
p = PorterStemmer()
p.stem("Alcoholic")

# Initialize empty array
# to append clean text
corpus = []
for i in range(0, 1000):
    # column : "Review", row ith
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    # convert all cases to lower cases
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()

    # loop for stemming each word
    # in string array at ith row
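    # (assumed continuation, not part of the scraped snippet) keep every word that
    # is not an English stopword, stem it with the single-argument stem() call used
    # at the top of this snippet, rejoin the tokens and collect the cleaned review
    review = [ps.stem(word)
              for word in review
              if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)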
Example #57
0
 def __init__(self):
     self._stemmer = PorterStemmer()
Example #58
0
 def train():  
	poslines = []
	neglines = []

	stopwords= open(r'stopwords.txt', 'r').read().splitlines()
	dataset= open('training_set.csv', 'r',encoding="utf8")

	dataset.readline()

	poslines=[]
	neglines=[]

	for data in dataset:
		data = data.lower()
		datalines = data.split(",")[1].strip('"').split(' ')
		DataClass = data.split(",")[0]
		#tokenizing the sentence
		if int(DataClass)==0:
			poslines.append(datalines)  
		elif int(DataClass)==1:
			neglines.append(datalines)
		else:
			continue
	print( "The total positive words are:", len(poslines))
	print ("The total negative words are: ", len(neglines))

	poslineedited = []
	neglinesedited = []


	#there are total 6397 positives and negatives.
	poslinesTrain= poslines[:3201]
	neglinesTrain= neglines[:3196]

	priorknowledgepo = []
	priorknowledgeneg = []

	priorknowledgeneg= 3196/ 6397
	priorknowledgepo = 3201/ 6397


	stemmer = PorterStemmer()
	model = open('model_file.csv', 'w',encoding="utf8")


	trainset= [(x,1) for x in poslinesTrain] + [(x,-1) for x in neglinesTrain]
	poswords={} #this dictionary stores counts for every word in positives
	negwords={} #and negatives

	for line,label in trainset: 
		words= getwords(line)

		for word in words:   
			word = word.lower()
			#increment the counts for this word based on the label
			#the .get(x, 0) method returns the current count for word 
			#x, or 0 if the word is not yet in the dictionary
			if label==1: poswords[word]= poswords.get(word, 0) + 1
			if label==-1: negwords[word]= negwords.get(word, 0) + 1
	positivewordlist = open(r'positive-words.txt', 'r').read().splitlines()
	negativewordlist = open(r'negative-words.txt', 'r').read().splitlines()

	#evaluate the test set
	testset= open('test_set.csv', 'r',encoding="utf8")
	testset.readline()           
	#make predictions
	output = open("prediction_file.csv", 'w')

	for line in testset:
		linesplit = line.split()
		testwords= getwords(linesplit)
		totpos, totneg= 0.0, 0.0
		for word in testwords:
			word = word.lower()
			
			a= poswords.get(word,0.0) + 1.0
			b= negwords.get(word,0.0) + 1.0 
			totpos+= a/(a+b)
			totneg+= b/(a+b) 
			model.write("Word: " +str(word) + ",")
			model.write("Relative positive usage: " + str(totpos)+ ",")
			model.write("Relative negative usage: "+str(totneg)+ '\n')
Example #59
0
 def __init__(self, documents, NDC, stop_words):
     super(Pepper, self).__init__()
     self.documents = documents
     self.NDC = NDC
     self.p = PorterStemmer()
     self.stop_words = stop_words
Example #60
0
def processEmail(email_contents):
    #PROCESSEMAIL preprocesses the body of an email and
    #returns a list of word_indices
    #   word_indices = PROCESSEMAIL(email_contents) preprocesses
    #   the body of an email and returns a list of indices of the
    #   words contained in the email.
    #

    # Load Vocabulary
    vocab = getVocabDict()

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers

    # hdrstart = email_contents.find('\n\n')
    # email_contents = email_contents[hdrstart+2:]

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with >
    # and does not have any < or > in the tag and replace it with a space
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.sub('[0-9]+', 'number', email_contents)

    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr', email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # Handle $ sign
    email_contents = re.sub('[$]+', 'dollar', email_contents)


    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print '\n==== Processed Email ====\n'

    # Process file
    l = 0
    porterStemmer = PorterStemmer()
    # Tokenize and also get rid of any punctuation
    sep = '[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\},\'\"\>\_\<\;\%\n\r]+'
    for s in re.split(sep, email_contents):
        # Remove any non alphanumeric characters
        s = re.sub('[^a-zA-Z0-9]', '', s)

        # Stem the word
        s = porterStemmer.stem(s.strip())

        # Skip the word if it is too short
        if len(s) < 1:
           continue

        # Look up the word in the dictionary and add to word_indices if
        # found
        # ====================== YOUR CODE HERE ======================
        # Instructions: Fill in this function to add the index of str to
        #               word_indices if it is in the vocabulary. At this point
        #               of the code, you have a stemmed word from the email in
        #               the variable s. You should look up s in the
        #               vocabulary dictionary (vocab). If a match exists, you
        #               should add the index of the word to the word_indices
        #               vector. Concretely, if s = 'action', then you should
        #               add to word_indices the value under the key 'action'
        #               in vocab. For example, if vocab['action'] = 18, then,
        #               you should add 18 to the word_indices vector
        #               (e.g., word_indices.append(18) ).
        #




        # =============================================================


        # Print to screen, ensuring that the output lines are not too long
        if l + len(s) + 1 > 78:
            print
            l = 0
        print s,
        l += len(s) + 1

    # Print footer
    print '\n========================='

    return array(word_indices)
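(A brief usage sketch, not part of the original: it reads an email text file — the file name emailSample1.txt is an assumption — and prints the resulting word indices.)

with open('emailSample1.txt') as f:
    email_contents = f.read()

word_indices = processEmail(email_contents)
print word_indices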