Example #1
 def __init__(self, id, context):
     self.id = id
     self.words = []
     self.context = [context]
     self.stemmer = stemmer.PorterStemmer()
     self.feature = []
     self.items = {}
Example #2
def indexText(text):
    # prepare text
    lines = text.split('.')
    clean_lines = [line.strip() for line in lines if line.strip()]
    newtext = '\n'.join(clean_lines)
    words = textmining.simple_tokenize(newtext)
    p = stemmer.PorterStemmer()
    # filter stop words (use a separate name so the input text is not overwritten)
    stop_text = open('stopwords.txt').read()
    stopwords = textmining.simple_tokenize(stop_text)
    # use stemming
    stemmed = []
    freq = {}
    occur = {}
    for index, w in enumerate(words):
        stem = p.stem(w, 0, len(w) - 1)
        stemmed.append(stem)
        if stem not in stopwords:
            freq[stem] = stemmed.count(stem)
            occur[stem] = w
    sorted_freq = sorted(freq.iteritems(),
                         key=operator.itemgetter(1),
                         reverse=True)
    # Concordance
    most_freq_words = sorted_freq[:1]
    print "------Index-----"
    print occur[most_freq_words.pop()[0]]
    print "----------------"
    return occur
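A minimal call sketch for indexText, assuming the operator, textmining, and stemmer modules used above are importable and a stopwords.txt file sits in the working directory (the sample text is illustrative):

sample = "Dogs chase cats. Cats chase mice. Dogs also chase mice."
occur = indexText(sample)  # prints the surface form of the most frequent stem
print(occur)               # e.g. maps each stem to one of its surface forms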
Example #3
def freq_analysis(conn, cursor, cuisine, ingredients_list):
    """Performs analysis on a cuisine's list of 100 recipes.
     Finds term frequencies. Each ingredient is stemmed and
     checked against a stopwords list. Term frequencies are
     store in a SQLite db with their associated cuisine type.
  """

    stopwords = [word[:-1] for word in open('stopwords.txt', 'r')]
    pstemmer = stemmer.PorterStemmer()

    # term counts; start each new stem at zero so freq reflects true occurrences
    freq = defaultdict(int)
    # used to map word stems to whole words
    mapping = defaultdict(list)

    for ingredients in ingredients_list:
        for ingredient in ingredients.split():
            ingredient = remove_punc(ingredient)
            if ingredient not in stopwords:
                ingredient_stem = pstemmer.stem(ingredient, 0,
                                                len(ingredient) - 1)
                freq[ingredient_stem] += 1
                mapping[ingredient_stem].append(ingredient)

    for ingred, count in sorted(freq.iteritems(),
                                key=operator.itemgetter(1),
                                reverse=True):
        nonstemmed_ingred = mapping[ingred]
        # if multiple words map to the same stem, take the shortest one in length
        dbingred = min(nonstemmed_ingred, key=len)
        insert_ingred(cursor, cuisine, dbingred, count)
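A hypothetical call sketch; insert_ingred and remove_punc are assumed to be helpers defined elsewhere in the same project, and the database and cuisine names below are made up for illustration:

import sqlite3

conn = sqlite3.connect('recipes.db')
cursor = conn.cursor()
ingredients_list = [
    'fresh basil leaves',
    'extra virgin olive oil',
    'basil, olive oil and garlic',
]
freq_analysis(conn, cursor, 'italian', ingredients_list)
conn.commit()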
Example #4
 def __init__(self):
     self.stemmer = stemmer.PorterStemmer()
     self.cleaner = Cleaner(style=True)
     self.stopWords = []
     # use the same filename for the existence check and the read
     if os.path.exists('stopwords'):
         self.stopWords = [line.strip() for line in open('stopwords')]
     else:
         self.stopWords = features.stopWords
 def _clean_word(self, word):
     word = word.lower()
     for punc in Document.PUNCTUATION + Document.CARRIAGE_RETURNS:
         word = word.replace(punc, '').strip("'")
     # stem once, after all punctuation is stripped: dogs -> dog ; created -> creat
     word = self.stemmer.stem(word, 0, len(word) - 1)
     return word if re.match(Document.WORD_REGEX, word) else None
def text_tokenizer(text):
	p = ps.PorterStemmer()
	tokenized = text.split()
	tokenized = [x.strip(bad_chars) for x in tokenized if '&#' not in x and x != '' and '<' not in x]
	for t in tokenized:
		if t not in terms_before_preprocess:
			terms_before_preprocess[t] = 1
		else:
			terms_before_preprocess[t] += 1
	return [p.stem(x.lower(), 0, len(x)-1) for x in tokenized]
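A small usage sketch; bad_chars and terms_before_preprocess are module-level globals in the original project, so the values below are only illustrative:

import stemmer as ps

bad_chars = '.,;:!?"()'
terms_before_preprocess = {}
print(text_tokenizer('Dogs created <b>messes</b>'))  # e.g. ['dog', 'creat']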
Example #7
    def __init__(self, storedb, logfunc=None):
        print 'Initializing ANEW module'

        self.store_db = storedb
        self.valence_db = ValenceDB()
        self.stemmer = stemmer.PorterStemmer()
        self.logfunc = logfunc
        self.total_count = 0

        # Connect to ANEW database
        self.valence_db.connect()
def stem(word):
    """
    Returns Porter stemmed version of words.
    Input can either be a string or list of strings.
    """
    p = stemmer.PorterStemmer()
    if isinstance(word, str):
        # Input is a single word
        return p.stem(word, 0, len(word) - 1)
    else:
        # Assume input is a list of words
        return [p.stem(w, 0, len(w) - 1) for w in word]
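Example usage, assuming the same local stemmer module is on the path:

print(stem('running'))            # -> 'run'
print(stem(['dogs', 'created']))  # -> ['dog', 'creat']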
Example #9
def stemWords(tokenList):
	p = stemmer.PorterStemmer()
	stemmedList = []
	for word in tokenList: 
		prevWord = ""
		# stem the token until it doesn't change any more.
		while word != prevWord:
			prevWord = word
			word = p.stem(word, 0, len(word) - 1)
		stemmedList.append(word)

	return stemmedList
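A quick sketch of calling stemWords on a token list (the expected stems follow the dogs -> dog, created -> creat behaviour noted in an earlier example):

print(stemWords(['dogs', 'created', 'connections']))  # e.g. ['dog', 'creat', 'connect']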
Example #10
 def normalizeText(self, text):
     text = text.lower()
     text = re.sub('[^0-9a-zA-Z]+', ' ', text)
     articleWords = text.split()
     articleWords = self.removeStopWords(articleWords)
     stemmedWords = []
     # create the stemmer once instead of once per word
     p = stemmer.PorterStemmer()
     for word in articleWords:
         stemmed = p.stemWord(word)
         self.reverseStemHashtable[stemmed] = word
         stemmedWords.append(stemmed)
     return stemmedWords
Example #11
    def parser(self):
        """
      Here I use html5lib so parse the pages retrieved.
      I am using BeautifulSoup as my parser here and I know it 
      is deprecated.  I will change this soon...

      Content is taken only from <p> tags, so this could be a lot
      more robust. 

      All words are stemmed and stopwords are removed.
    """

        #get stopwords; remove newline char
        parsed_html = {}
        stopwords = [word[:-1] for word in open('stopwords.txt')]
        pstemmer = stemmer.PorterStemmer()

        htmldocs = os.listdir('pages/')  # grab all html docs and parse them
        words_splitter = re.compile(r'\W*')  #split on non words
        for htmldoc in htmldocs:
            f = open('pages/' + htmldoc, 'r')
            link = f.readline()
            html = f.readlines()

            try:
                print htmldoc
                p = html5lib.HTMLParser(
                    tree=treebuilders.getTreeBuilder('beautifulsoup'))
                tree = p.parse(html)
            except:
                os.remove(os.path.join('pages', htmldoc))
                print 'error parsing %s' % htmldoc
                continue

            title = tree.findAll('title')
            if title: title = title[0].text
            else: title = ''

            #grab text from p tags
            data = [p.text.lower() for p in tree.findAll('p')]
            #remove stopwords
            unstemmed_words = [
                word for word in words_splitter.split(' '.join(data))
                if word != '' and word not in stopwords
            ]
            stemmed_words = [
                pstemmer.stem(word, 0,
                              len(word) - 1) for word in unstemmed_words
            ]
            parsed_html[(title, int(htmldoc), link)] = stemmed_words

        return parsed_html
Example #12
def text_tokenizer(text, topic, isTrain, doc):
    p = ps.PorterStemmer()
    tokenized = [
        x.strip(bad_chars) for x in text.split()
        if '&#' not in x and x != '' and '&lt;' not in x
    ]
    tokenized = [
        p.stem(x.lower(), 0,
               len(x) - 1) for x in tokenized if x not in stopwords
    ]
    terms = collections.Counter(tokenized)
    if isTrain == 1:
        tokens_by_topic[topic] += tokenized
        add_to_dict(list(terms), doc)
    return terms
def stem(f):
    p = stemmer.PorterStemmer()

    infile = open(f, 'r')
    # open the destination once; reopening it with 'w' inside the loop
    # would overwrite everything except the last line
    outfile = open("pre_stem.txt", 'w')
    while 1:
        output = ''
        word = ''
        line = infile.readline()
        if line == '':
            break
        for c in line:
            if c.isalpha():
                word += c.lower()
            else:
                if word:
                    output += p.stem(word, 0, len(word) - 1)
                    word = ''
                output += c.lower()
        outfile.write(output)
    outfile.close()
    infile.close()
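Usage sketch: the function reads a plain-text file and writes a lowercased, stemmed copy to pre_stem.txt in the working directory (the input filename is illustrative):

stem('corpus.txt')
print(open('pre_stem.txt').read())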
Example #14
def getWordsFrom(string): # get the list of words from './string' directory
    string = string.lower()
    wordList = []
    for root, dirs, files in os.walk('./'):
        if string in root.lower():
            for file in files:
                f = open(root+'/'+file, 'r')
                tmp = [ x.lower() for x in re.split('[^a-zA-Z]+',f.read())] 
                #Only select English words in the list
                tmp = filter(lambda a: len(a) >= 2 and a not in unusedWords, tmp)
                #Remove non-useful words to detect spam or not
                wordList += tmp
                f.close()

    p = stemmer.PorterStemmer()
    for i in xrange(len(wordList)):
        wordList[i] = p.stem(wordList[i], 0,len(wordList[i])-1)
        
    wordList = list(set(wordList)) 
    #Remove overlapping words
    wordList.sort()
    return wordList    
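Hypothetical call, assuming the working directory contains e.g. a ./spam/ subfolder of text files and that unusedWords is defined at module level:

spam_vocab = getWordsFrom('spam')
print(len(spam_vocab))   # number of distinct stemmed terms found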
Example #15
 def __init__(self):
     self.stemmer = stemmer.PorterStemmer()
     self.stop_words = self.load_stop_words(
         os.path.join(os.path.abspath(os.path.dirname(__file__)),
                      'english.stop'))
Example #16

def and_not_comp(set1, set2):
    #print '\t', set1, '-', set2
    return set2.difference(set1)


def clean(str):
    str = str.replace(',', '').replace('(', '').replace(')', '').replace(
        '\'', '').replace('"', '').replace(';', '').replace('.', '')
    return str


operators = {'&&': and_comp, '||': or_comp, '&^': and_not_comp}

stemmer = stemmer.PorterStemmer()


def index(id, doc):
    terms = doc.split()

    for term in terms:
        term = term.lower()
        term = clean(term)

        doc_ids = inverted_index.get(term)
        if doc_ids:
            doc_ids.add(id)
        else:
            inverted_index[term] = set()
            inverted_index[term].add(id)
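A small usage sketch; inverted_index is assumed to be a module-level dict, and and_comp / or_comp are the project's other boolean operators, which are not shown above:

inverted_index = {}
index(1, 'The quick brown fox')
index(2, 'The lazy dog')
print(inverted_index['the'])   # -> set([1, 2])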
Example #17
import stemmer as ps
import pickle
import sys

dictionary = pickle.load(open("dictionary", "rb"))
posindex = pickle.load(open("posindex", "rb"))

p = ps.PorterStemmer()


def handle_conj_query(word_list):
    try:
        w1_id = dictionary[word_list[0]]
    except (KeyError, IndexError):
        return []
    doclist1 = list(posindex[w1_id].keys())
    for i in range(1, len(word_list)):
        if not doclist1:
            return []
        w2_id = dictionary[word_list[i]]
        doclist2 = list(posindex[w2_id].keys())
        doclist1 = [x for x in doclist1 if x in doclist2]
    return doclist1


def handle_phrase_query(word_list):

    return handle_prox_query(word_list, [0] * (len(word_list) - 1))


def handle_prox_query(word_list, prox_list):
Example #18
 def __init__(self, imap):
     self.ps=stemmer.PorterStemmer()
     self.imap=imap
     self.db=psycopg.connect('dbname=imapindex host=db user=dustin ' + \
         'password=blahblah', serialize=0)
     self.c=self.db.cursor()