Example #1
def process_words(words):
    # Convert to lower case
    words = words.strip()
    words = words.lower()
    # replace abbreviations with full words and do word stemming
    splited = words.split(" ")
    processed_words = []
    tpes = []
    for word in splited:
        if word not in ['[addr]', '[logo]', '[supplier]']:
            word = word.strip(" ,._+=!@%^&*:;/?<>()[]{}|'").rstrip("$")
        if len(word) <= 1 and not word.isdigit() and word != "#":
            continue
        tpe = get_type(word)
        if word in abbrev_dict:
            processed_word = stem(abbrev_dict[word])
        elif stem(word) in abbrev_dict:
            processed_word = stem(abbrev_dict[stem(word)])
        else:
            processed_word = stem(word)
        if (tpe != 'text'):
            processed_word = tpe
        if processed_word not in sw:
            processed_words.append(processed_word.strip("#"))
            tpes.append(tpe)
    return (' '.join(processed_words), ','.join(tpes))
def add_stemmered(word):
  porter_stemmered = open("porter_stemmered.txt", "w")
  for line in open(word):
    line = line.strip().split('\t')
    if line[0] != '':
      print (line[0] + '\t' + line[1] + '\t' + stem(line[1]))
      porter_stemmered.write(line[0] + '\t' + line[1] + '\t' + stem(line[1]) + '\n')

  porter_stemmered.close()
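A minimal usage sketch for add_stemmered; judging from how it indexes line[0] and line[1], the argument appears to be the path of a tab-separated file with an identifier and a word per line (the file name and contents below are made up):

with open("wordlist.txt", "w") as out:
    out.write("1\trunning\n")
    out.write("2\tjumps\n")

add_stemmered("wordlist.txt")
# prints each id, word, and stem, and writes the same rows to porter_stemmered.txt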
Example #3
def main():
    try:
        k = int(sys.argv[2])
        fileName = sys.argv[1]
        wordsDict = createWordsDict(fileName)
        wordsDict = sorted(wordsDict.items(), key=itemgetter(1),
                           reverse=True)[:k]

        # print the stemmed form of each of the k most frequent words
        for word, count in wordsDict:
            print porter2.stem(word)
    except:
        print "Error"
Example #4
def stem_phrase(phrase):
	words = phrase.split()
	for i in xrange(0, len(words)):
		words [i] = porter2.stem(words[i])
		
	return ' '.join(words)
		
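A quick usage sketch for stem_phrase, assuming porter2 here is the porter2 module from the "stemming" package:

from stemming import porter2
print(stem_phrase("the runners were running quickly"))
# e.g. "the runner were run quick"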
Example #5
def getFeatures(attraction,title,bodyText,labels,stopwords): 
    features = defaultdict() 
    for word in attraction: 
        features[('attraction_word',word)]=1
    titleCount=0
    for word in title: 
        if word.lower() not in stopwords: 
            features[('title_word',word.lower())]=1
            if word.lower() in labels:
                features[('title_label',word.lower())]=1
            if titleCount==0:
                features[('first_word',word)]=1
        titleCount+=1
    counter = 0
    previousWord = ''
    for word in bodyText: 
        if word.lower() not in stopwords and word not in string.punctuation: 
            features[('body_word',word)]=1
            features[('body_word_stemmed',stem(word).lower())]=1
            # if postags[counter][1]=='JJ':
            #   features[('body_jj',word)]=1
            # if postags[counter][1]=='NN':
            #   features[('body_nn',word)]=1
            if word.lower() in labels:
                features[('body_label',word.lower())]=1
            # if counter != 0:
            #   features[('bigram',previousWord.lower()+word.lower())]=1
            previousWord=word
            # for key,value in gazeteer.iteritems():
            #   if word.lower() in value: 
            #       features[(key+'gazeteer',word.lower())]=1

        counter+=1
    features[('length_review',len(bodyText))]=1
    return features
def pre_process(query, stem = True):
    listterm = list()
    if stem:
        listterm = [porter2.stem(term) for term in query.split()]
    else:
        listterm = query.split()
    return listterm
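A brief usage sketch for pre_process (the query string is made up), assuming porter2 is imported as in the function above:

print(pre_process("information retrieval systems"))
# e.g. ['inform', 'retriev', 'system']
print(pre_process("information retrieval systems", stem=False))
# ['information', 'retrieval', 'systems']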
Example #7
 def _stemDocs(self):
   for doc in self._dWords:
     new_doc = ''
     for word in doc:
       new_doc += porter2.stem(word) + ' '
     self._doc_stem.append(new_doc)
     self._doc_stem_words.append(new_doc.split())
   print ">>> Docs stemmed"
Example #8
 def stem(self, word):
     try:
         if self.stemming_algo == 'porter':
             return porter2.stem(word)
         elif self.stemming_algo == 'lovins':
             return lovins.stem(word)
         else:
             return paicehusk.stem(word)
     except Exception, e:
         pass
Example #9
 def stem(self, word):
     try:
         if self.stemming_algo == 'porter':
             return porter2.stem(word)
         elif self.stemming_algo == 'lovins':
             return lovins.stem(word)
         else:
             return paicehusk.stem(word)
     except Exception, e:
         pass
def performStemming(filteredReviewTxt):
    stemmedLst = []
    for word in filteredReviewTxt:
        #print word
        if not word:
            continue
        else:
            stemmedword = porter2.stem(word)
            stemmedLst.append(stemmedword)
    stemmedOutput = str(' '.join(j for j in stemmedLst))
    return stemmedOutput
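A short usage sketch for performStemming, assuming porter2 is imported as in the function above; the token list is made up:

print(performStemming(["boldly", "going", "", "forward"]))
# empty tokens are skipped; e.g. "bold go forward"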
Example #11
def stem(caller, word):
	global _orengostemmer

	lang = getattr(caller, "lang", "en")
	if lang == "en":
		return porter2.stem(word)
	elif lang == "pt":
		if _orengostemmer is None:
			from ptstemmer.implementations.OrengoStemmer import OrengoStemmer
			_orengostemmer = OrengoStemmer()
		return _orengostemmer.getWordStem(word)
	else:
		return word
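A minimal sketch of calling this language-aware stem; the Caller class below is a hypothetical stand-in for whatever object supplies the lang attribute:

class Caller(object):
    def __init__(self, lang):
        self.lang = lang

print(stem(Caller("en"), "running"))   # English goes through porter2, e.g. "run"
print(stem(Caller("fr"), "running"))   # unsupported languages are returned unchanged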
Example #12
def Stemming():
    filesList = glob.glob(dirPath)
    #ps = PorterStemmer()
    global listWord
    global noScanDocs, noStems
    for files in filesList:
        noScanDocs += 1
        textFile = open(files, "r")
        words = textFile.read().lower()
        plainWord = re.sub('<[^>]*>', '', words)
        listWord = Tokenize(plainWord)
        for w in listWord:
            noStems += 1
            stemWord[porter2.stem(w)] += 1
def getFeatures(attraction,title,bodyText,labels,stopwords): 
    # dictionary to hold features 
    features = defaultdict() 

    # NOTE: features that are commented out were experimented with but deemed
    # either not useful or prone to overfitting

    # loop through words in the attraction
    for word in attraction: 
        features[('attraction_word',word)]=1
    titleCount=0

    # loop through words in title 
    for word in title: 
        # check not in stopwords
        if word.lower() not in stopwords: 
            features[('title_word',word.lower())]=1
            if word.lower() in labels:
                features[('title_label',word.lower())]=1
            if titleCount==0:
                features[('first_word',word)]=1
        titleCount+=1
    counter = 0
    previousWord = ''

    # loop through words in the body text
    for word in bodyText: 
        # check not punctuation or stopword
        if word.lower() not in stopwords and word not in string.punctuation: 
            features[('body_word',word)]=1
            features[('body_word_stemmed',stem(word).lower())]=1
            # features that were not useful are commented out here
            # if postags[counter][1]=='JJ':
            #   features[('body_jj',word)]=1
            # if postags[counter][1]=='NN':
            #   features[('body_nn',word)]=1
            if word.lower() in labels:
                features[('body_label',word.lower())]=1
            # if counter != 0:
            #   features[('bigram',previousWord.lower()+word.lower())]=1
            previousWord=word
            # for key,value in gazeteer.iteritems():
            #   if word.lower() in value: 
            #       features[(key+'gazeteer',word.lower())]=1

        counter+=1
    features[('length_review',len(bodyText))]=1
    return features
Example #14
def abstract_words(abstract):
    """Get each word in the abstract, making chars lower case, removing
    non-alphabetic chars and spaces."""
    allowable_chars = unicode("abcdefghijklmnopqrstuvwxyz ")
    trimmed = "".join([char for char in abstract.lower()
                       if char in allowable_chars])
    words = []
    for word in trimmed.split(" "):
        if len(word) <= 1 or word in COMMON_ENGLISH:
            continue
        try:
            stemmed_word = stem(word)
        except ValueError:
            stemmed_word = word
        words.append(stemmed_word)
    return words
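A usage sketch for abstract_words (the abstract text is made up); the exact output depends on the COMMON_ENGLISH stop list defined elsewhere in this module:

print(abstract_words(u"Protein folding was analysed in cultured cells."))
# e.g. [u'protein', u'fold', u'analys', u'cultur', u'cell'] once common words are dropped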
    def processQuery(self,doc_list):
        """
        This is the main function which calculates scores for all documents and ranks them in decreasing order of score
        """
        #Tokenize query
        #Add punctuation handling functionality
        if '.' in self.query:
            self.query = self.query.replace('.', ' ')
        if "'" in self.query:
            self.query = self.query.replace("'", "")
        if "`" in self.query:
            self.query = self.query.replace("`", "")
        if '''"''' in self.query:
            self.query = self.query.replace('''"''', "")
        self.query=self.query.decode("utf8")
        words=nltk.word_tokenize(self.query.lower())
        score={}
        score=defaultdict(lambda:0,score)
        for word in words:

            #Stem words to bring into base form 
            word=porter2.stem(word)
            #print word
            #Fetching Document Frequency of the query word.
            cursor=self.db.cursor()
            sql="SELECT FREQ from DOC_FREQ where WORD='%s'"%word
            cursor.execute(sql)
            if cursor.rowcount>0:
                dft=cursor.fetchone()[0];
                #print dft
                idf=self.calculate_IDF(dft);
            else:
                idf=0
            for doc in doc_list:
                tf=self.get_TF(word,doc)
                #print "tf="+str(tf)
                #print "idf="+str(idf)
                tw=self.getTermWeight(tf,doc)
                #print tw
                score[doc]=score[doc]+idf*tw
                #print "score-%d"%score[doc]
        #We have now calculated the score of documents with respect to our query.
        result=[]
        for doc, doc_score in sorted(score.iteritems(), key=lambda (k, v): (v, k)):
            result.append((doc_list[doc], doc_score))
        result.reverse()
        return self.final_result(result)
Example #16
def get_matrix(file_name):
    # Initialize the dictionary object allwords as empty
    allwords = {}
    # Initialize the list objects articlewords and titles as empty
    articlewords = []
    titles = []
    art_no = 0
    # Open the file of articles
    f = open(file_name)
    # Read the first line into line
    line = f.readline()
    # Process every line of the file, one line at a time, until the last line
    while line:
        # Append an empty word dictionary for this article (initialized so it can later be accessed by art_no)
        articlewords.append({})
        # Split on "|" and append the first element to the list of titles
        titles.append(re.split('\|', line)[0])
        # Split on runs of one or more non-alphanumeric characters and keep only the lower-cased strings of four or more characters
        words = [s.lower() for s in re.split('\W+', line) if len(s) > 3]
        # Process each string in the words list
        for word in words:
            # Extract the word stem
            word_stem = stem(word)
            # If word_stem is not yet a key in the dictionary allwords, add the entry 'word_stem: 0' to allwords
            ###### code goes here ######
            allwords.setdefault(word_stem, 0)
            # Increment the value stored under the key word_stem in allwords by 1
            allwords[word_stem] = allwords[word_stem] + 1
            ###### code goes here ######
            # If word_stem is not yet a key in the current article's word dictionary articlewords[art_no], add 'word_stem: 0' to articlewords[art_no]
            articlewords[art_no].setdefault(word_stem, 0)
            # Increment the value stored under the key word_stem in articlewords[art_no] by 1
            articlewords[art_no][word_stem] += 1
        # Read the next line into line
        line = f.readline()
        art_no += 1
    f.close()
    # Of the observed words, keep only those appearing at least twice, putting them into wordlist
    wordlist = []
    for w, c in allwords.items():
        if c > 1:
            wordlist.append(w)
    # Return the list of titles, the list of words, and a matrix of how many times each word appears in each article.
    # The words appearing in article art_no are stored in articlewords[art_no]; the matrix is built using only those words that are also in wordlist.
    return titles, wordlist, matrix([[(word in f and f[word] or 0)
                                      for word in wordlist]
                                     for f in articlewords])
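A small sketch of the input format get_matrix assumes (one article per line, with the title and the body separated by "|"); the file name and article text are made up, and the function's own imports (re, stem, and numpy's matrix) are taken as given:

with open("articles.txt", "w") as out:
    out.write("First title|Running runners keep running quickly\n")
    out.write("Second title|Quick runners like running repeatedly\n")

titles, wordlist, mat = get_matrix("articles.txt")
print(titles)     # ['First title', 'Second title']
print(wordlist)   # only the stems that occur more than once across all articles
print(mat)        # one row per article, one column per stem in wordlist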
Example #17
 def _sumStem(self):
   k            = 1.0
   weighted_sum = 0.0
   num_docs     = len(self._dWords)
   doc_len_avg  = 0.0
   
   for doc in self._doc_stem_words:
     doc_len_avg += len(doc)-1.0 # Subtract 1 so the identifier stored in element 0 is not counted
   doc_len_avg = doc_len_avg / len(self._dWords) # The averaging step
   
   # The main tf.idf loop
   # For each query we calculate the tf.idf for each document and add this to the list.
   # This means query_weights will be len(documents) and weighted sums will be len(queries)
   # and the overall size will be len(queries)*len(documents) ?
   for query in self._qWords:
     query_weights = []
     print ">>> Processing query " + query[0]
     
     for doc in self._doc_stem_words:
       doc_len = len(doc)-1.0
       weighted_sum = 0.0
       
       for word in query[1:]:
         word = porter2.stem(word) # Inline stemming of query words
         tf_wq = query.count(word)
         tf_wd = doc.count(word)
         df_w  = 0.0
         tf_idf = 0.0
         # No point calculating the tf.idf if we know it's going to be zero
         if (tf_wd != 0):
           df_w = self._numStemDocsContain(word) # This step takes ages. :(
           tf_idf = (tf_wq*(tf_wd / (tf_wd + ((k*doc_len)/doc_len_avg) ))*( math.log(num_docs/df_w) ))
         weighted_sum += tf_idf
         
       # Only care about things with a weight above 0
       if (weighted_sum != 0):
         query_weights.append((query[0], doc[0], str(weighted_sum)))
     
     self._weighted_sums.append(query_weights)
     
   if (self._performWrite):
     self._writeOut()
     print ">>> Writing tf.idf results to tfidf.top"
Example #18
def azureml_main(df_message=None, df_features=None):
    # Merging the header and the message body
    heading = df_message.Heading.iloc[0]
    if not heading:
        heading = ''

    content = df_message.Content.iloc[0]
    org_message = heading.lower() + " " + content.lower()

    feature_vec = list(df_features.iloc[0, :])

    # PREPROCESSING
    #--------------------
    # Replace various patterns
    message = org_message
    for key, pattern_list in patterns:
        for p in pattern_list:
            message = re.sub(p, key, message)

    message = message.replace('kjempe', '')
    message = message.replace('mega', '')
    message = message.replace('super', '')

    message = re.split('\W+', message)
    message = [w for w in message if w not in words_to_ignore]

    message = [stem(w) for w in message]

    # FIND FEATURES IN MESSAGE
    #-----------------------------
    # For each message, count the features which occurs in that particular message
    message_features = [0] * len(feature_vec)
    for idx, feat in enumerate(feature_vec):
        message_features[idx] = message.count(feat)

    d = {'0': message_features}
    df_output = pd.DataFrame().from_dict(d, orient='index')
    df_output.columns = df_features.columns

    # Return value must be of a sequence of pandas.DataFrame
    return df_output,
Example #19
def stemWords(data):

    jstuff = []

    for word in data:
        suffix = ''
        root = ''
        if word.lower() not in stopList:
            if '*' not in word:
                root = stem(word)
                print root

                for i,s in enumerate(difflib.ndiff(root, word)):
                    if s[0]==' ': continue
                    #elif s[0]=='-':
                    #    print(u'Delete "{}" from position {}'.format(s[-1],i))
                    elif s[0]=='+':
                        print(u'Add "{}" to position {}'.format(s[-1],i))
                        suffix += s[-1]
                jstuff.append({'word':root, 'variance':suffix})
    return jstuff
def word_stem_stop_word(reply_text,num):
	stopwordsfile = open('stopwords.txt')
	stopwords = stopwordsfile.read().split('\r\n')
	nltk_word = nltk.word_tokenize(reply_text)
	nltk_word = nltk.pos_tag(nltk_word)
	reply = []
	proper_nouns = Set([])
	for word, tag in nltk_word:
		if str(word.lower()) not in stopwords:
			if (tag == 'NNP' or tag == 'NNPS'):
				proper_nouns.add(word.lower())
			else:
				word = correct.correct(word)
				word = porter.stem(word)
				word = correct.correct(word)
			reply.append(word.lower())
	#print reply
	#print proper_nouns
	reply = ' '.join(reply)
	if num:
		return reply,proper_nouns
	else:
		return reply
Example #21
def applyStemming(stopWordsRemovedAbstract):
    stemmingAppliedAbstract = []
    for word in stopWordsRemovedAbstract:
        stemmingAppliedAbstract.append(stem(word))
    return stemmingAppliedAbstract
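A quick usage sketch, assuming stem is the porter2 stemmer imported by this module:

print(applyStemming(["running", "jumps", "quickly"]))
# e.g. ['run', 'jump', 'quick']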
Example #22
def thirty():
    with open("./medline.txt.send.tok") as f:
        for token in f:
            print(porter2.stem(token.strip()))
Example #23
def stem(words):
  return {porter2.stem(word) for word in words}
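A usage sketch for this set-based helper, assuming porter2 comes from the "stemming" package:

from stemming import porter2
print(stem(["running", "runs", "runner"]))
# e.g. set(['run', 'runner'])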
Example #24
text2_tokens = wgetAndTokenize(text2_url)

# make RDD with list of words along with their position in the original text (so we can find context later)
text1_tokensRDD = sc.parallelize(text1_tokens).zipWithIndex()
text2_tokensRDD = sc.parallelize(text2_tokens).zipWithIndex()
#print text1_tokensRDD.take(5)

# get rid of sequences of non-word chars, keep remaining strings with something in them, and not in stop list:
text1_tokensRDD = text1_tokensRDD.map(lambda p: (re.sub('\W+', '', p[0]).lower(
), p[1])).filter(lambda p: len(p[0]) > 0 and not p[0] in stop_words)
print text1_tokensRDD.take(5)
text2_tokensRDD = text2_tokensRDD.map(lambda p: (re.sub('\W+', '', p[0]).lower(
), p[1])).filter(lambda p: len(p[0]) > 0 and not p[0] in stop_words)

# stem the words using imported stem function (chosen arbitrarily)
text1_stemmedRDD = text1_tokensRDD.map(lambda p: (stem(p[0]), p[1]))
print text1_stemmedRDD.take(5)
text2_stemmedRDD = text2_tokensRDD.map(lambda p: (stem(p[0]), p[1]))

t1raw = text1_stemmedRDD.toDF(['entry', 'locus'])
t1raw.show()

t2raw = text2_stemmedRDD.toDF(['entry', 'locus'])

t1raw.registerTempTable("t1raw")
t2raw.registerTempTable("t2raw")

bg1 = sqlContext.sql(
    "select a.entry a1, b.entry b1, a.locus, b.locus from t1raw a cross join t1raw b where a.entry < b.entry and a.locus - b.locus < 7 and b.locus - a.locus < 7"
)
bg1.show(4)
Example #25
 def stemming(self, index, text, data):
     from porter2 import stem
     
     if len(text) <= 129:
         text += ' twentyfivesentence'
     elif len(text) <= 181:
         text += ' fiftysentence'
     elif len(text) <= 243:
         text += ' seventyfivesentence'
     else:
         text += ' largesentence'
     
     for i in xrange(text.count('%')):
         text += ' uniqpercent'
     for i in xrange(text.count('@')):
         text += ' uniqatmark'
     for i in xrange(text.count(',')):
         text += ' uniqcomma'
     for i in xrange(text.count("'")):
         text += ' uniqapostrophe'
     for i in xrange(text.count('...')):
         text += ' uniqellipses'
     for i in xrange(text.count(':')):
         text += ' uniqcolon'
     for i in xrange(text.count('!')):
         text += ' uniqexclamation'
     if '(' in text or ')' in text:
         text += ' uniqparentheses'
     for i in xrange(text.count('?')):
         text += ' uniqquestion'
     for i in xrange(text.count('"')):
         text += ' uniqquote'
     for i in xrange(text.count('#')):
         text += ' uniqhashtag'
     for i in xrange(text.count('0')):
         text += ' uniqzero'
     for i in xrange(text.count('1')):
         text += ' one'
     for i in xrange(text.count('2')):
         text += ' two'
     for i in xrange(text.count('3')):
         text += ' three'
     for i in xrange(text.count('4')):
         text += ' four'
     for i in xrange(text.count('5')):
         text += ' five'
     for i in xrange(text.count('6')):
         text += ' six'
     for i in xrange(text.count('7')):
         text += ' seven'
     for i in xrange(text.count('8')):
         text += ' eight'
     for i in xrange(text.count('9')):
         text += ' nine'
     if '/' in text:
         text = text.replace('/', ' ')
         text += ' forwardslash'
     
     upper = sum(1 for i in text if i.isupper())
     iterate = 1
     while upper - iterate > 0:
         text += ' formalword'
         iterate += 1
     
     
     import wikipedia
     import re
     if index % 100 == 0:
         p = 100*index/len(data)
         print "Wiki part, percent: %d" % p
     searchForEpi = data[index]['trope']
     searchForEpi = re.sub(r'([A-Z])', r' \1', searchForEpi)
     searchForTitle = data[index]['page']
     searchForTitle = re.sub(r'([A-Z])', r' \1', searchForTitle)
     search = searchForEpi + ' ' + searchForTitle
     try:
         summary = wikipedia.summary(search, sentences=1)
         newtext = str(text) + ' ' + str(summary)
     except:
         summary = None
     
     
     word_list = text.split()
     for i in xrange(len(word_list)):
         word_list[i] = stem(word_list[i])
     join_list = ' '.join(word_list)
     
     join_list += ' ' + data[index]['page']
     
     return join_list
Example #26
def getTextFeatures2(dirName):

	engStopWords = stopwords.words('english')
	mystopwords = ["a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also","although","always","am","among", "amongst", "amoungst", "amount",  "an", "and", "another", "any","anyhow","anyone","anything","anyway", "anywhere", "are", "around", "as",  "at", "back","be","became", "because","become","becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom","but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven","else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own","part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the"]
	mergedStopWords = list(set(engStopWords + mystopwords))
	print len(engStopWords), len(mystopwords), len(mergedStopWords)
	# initializations:
	ignoreCase = True
	iDoc = 0
	tf = []
	idf = []
	tfidf = []
	nWords = []
	words = []
	allFiles = []

	for subdir, dirs, files in os.walk(dirName):
	    files.sort() 
	    for file in files:  							# for each file in the given directory:
		file = dirName + file							# update the list of files analyzed:
		allFiles.append(file)
		curWords = []
		# print repr(iDoc) + " of " + repr(len(files)) + file
		nWords.append(0) 							# add count of total words in the current document
		for line in open(file): 						# for each line in the current document:
			if ignoreCase: line = line.lower()
			tokenizer = RegexpTokenizer('[\d\.]+|\w+|\$[\d\.]+')		# initialize tokenizer
			tokens = tokenizer.tokenize(line)
			
			for word in tokens: # for each word:
				if len(word) > 2 and not word.isdigit() and word not in mergedStopWords: 	# if the word is not in the list of stop words and its length is at least 3
					# word = WordNetLemmatizer().lemmatize(word, 'v') # stemming
					# word = PorterStemmer().stem_word((word))
					word = stem(word)
					if word.isdigit():
						continue
					nWords[iDoc] += 1; 				# update the number of total words in the document
					curWords.append(word)
					
					if word not in words:				# if the current word is not in the GLOBAL list of words (bag of words):
						tf.append([0]*len(files))		# add zeros to the tf matrix
						tfidf.append([0]*len(files))		# add zeros to the tf-idf matrix
						tf[-1][iDoc] += 1			# increase by 1 the tf value of the current element for the current word
						words.append(word)			# add the word to the bag of words
						#print len(words)
						idf.append(0)				# add a zero to the idf array
					else:
						idxWord = words.index(word)		# find the index of the current word in the bag of words
						tf[idxWord][iDoc] += 1			# update the term frequency matrix
			
		nGrams2 = ngrams(curWords, 2)
		for ngram, count in nGrams2.iteritems():
			if count>1:
				if ngram not in words:				# if the current word is not in the GLOBAL list of words (bag of words):
					tf.append([0]*len(files))		# add zeros to the tf matrix
					tfidf.append([0]*len(files))		# add zeros to the tf-idf matrix
					tf[-1][iDoc] += count			# increase by 1 the tf value of the current element for the current word
					words.append(ngram)			# add the word to the bag of words	
					#print "NGRAM: " + str(len(words))
					idf.append(0)				# add a zero to the idf array
				else:
					idxWord = words.index(ngram)		# find the index of the current word in the bag of words
					tf[idxWord][iDoc] += 1			# update the term frequency matrix

			
		iDoc = iDoc + 1 						# current number of processed documents:
		
		# CLEAR NOT FREQUENT VALUES:
#		if iDoc % 10 == 0:
#			print "DOC " + str(iDoc) + " of " + str(len(files)) + " - - - # WORDS " + str(len(words))

		# BUG!!!!!!!!!!!!!!!!!!!!
		if (iDoc % 500 == 0) | (iDoc == len(files)):
			toRemove = []
			print  "            CLEANING: DOC " + str(iDoc) + " of " + str(len(files)) + " - - - - - Words before cleaning" , len(words), 
			for w in range(len(words)):
				countDocs = sum(x > 0 for x in tf[w])
				if countDocs < 3:
					toRemove.append(w)

			for rindex in sorted(toRemove, reverse=True):
			    	del tf[rindex]
				del tfidf[rindex]
				del idf[rindex]
				del words[rindex]

			print " Words after cleaning" , len(words)
	
	numOfDocs = float(iDoc); 							# total number of processed docs

	# post process: compute the final tf array and compute the idf counter:
	for i in range(len(tf)): 							# for each word
		for j in range(len(tf[i])): 						# for each document
			if tf[i][j] > 0:						# if the current word has appeared (at least once) in the current document:
				idf[i] += 1.0						# update the idf counter
			if (nWords[j] > 0):
				tf[i][j] = tf[i][j] / float(nWords[j])			# normalize the tf value
			else:
				tf[i][j] = 0.0

	T1 = 1.0

	# iterate in reverse so that popping does not shift indices that are still to be visited
	for i in reversed(range(len(idf))):
		if idf[i] < T1:
			idf.pop(i)
			tf.pop(i)
			words.pop(i)
			tfidf.pop(i)

	dFreq = []

	# compute the final tf value
	for i in range(len(idf)):
		dFreq.append(idf[i] / numOfDocs)
		idf[i] = 1.0 + math.log10(numOfDocs/idf[i])
	
	# compute the tf-idf value:
	for i in range(len(tf)): 							# for each word
		for j in range(len(tf[i])): 						# for each document
			tfidf[i][j] = idf[i] * tf[i][j]

	return (allFiles, words, tf, idf, tfidf, dFreq)
Example #27
    def stemming(self, index, text, data):
        from porter2 import stem

        if len(text) <= 129:
            text += ' twentyfivesentence'
        elif len(text) <= 181:
            text += ' fiftysentence'
        elif len(text) <= 243:
            text += ' seventyfivesentence'
        else:
            text += ' largesentence'

        for i in xrange(text.count('%')):
            text += ' uniqpercent'
        for i in xrange(text.count('@')):
            text += ' uniqatmark'
        for i in xrange(text.count(',')):
            text += ' uniqcomma'
        for i in xrange(text.count("'")):
            text += ' uniqapostrophe'
        for i in xrange(text.count('...')):
            text += ' uniqellipses'
        for i in xrange(text.count(':')):
            text += ' uniqcolon'
        for i in xrange(text.count('!')):
            text += ' uniqexclamation'
        if '(' in text or ')' in text:
            text += ' uniqparentheses'
        for i in xrange(text.count('?')):
            text += ' uniqquestion'
        for i in xrange(text.count('"')):
            text += ' uniqquote'
        for i in xrange(text.count('#')):
            text += ' uniqhashtag'
        for i in xrange(text.count('0')):
            text += ' uniqzero'
        for i in xrange(text.count('1')):
            text += ' one'
        for i in xrange(text.count('2')):
            text += ' two'
        for i in xrange(text.count('3')):
            text += ' three'
        for i in xrange(text.count('4')):
            text += ' four'
        for i in xrange(text.count('5')):
            text += ' five'
        for i in xrange(text.count('6')):
            text += ' six'
        for i in xrange(text.count('7')):
            text += ' seven'
        for i in xrange(text.count('8')):
            text += ' eight'
        for i in xrange(text.count('9')):
            text += ' nine'
        if '/' in text:
            text = text.replace('/', ' ')
            text += ' forwardslash'

        upper = sum(1 for i in text if i.isupper())
        iterate = 1
        while upper - iterate > 0:
            text += ' formalword'
            iterate += 1

        import wikipedia
        import re
        if index % 100 == 0:
            p = 100 * index / len(data)
            print "Wiki part, percent: %d" % p
        searchForEpi = data[index]['trope']
        searchForEpi = re.sub(r'([A-Z])', r' \1', searchForEpi)
        searchForTitle = data[index]['page']
        searchForTitle = re.sub(r'([A-Z])', r' \1', searchForTitle)
        search = searchForEpi + ' ' + searchForTitle
        try:
            summary = wikipedia.summary(search, sentences=1)
            newtext = str(text) + ' ' + str(summary)
        except:
            summary = None

        word_list = text.split()
        for i in xrange(len(word_list)):
            word_list[i] = stem(word_list[i])
        join_list = ' '.join(word_list)

        join_list += ' ' + data[index]['page']

        return join_list
Example #28
def my_stem(word):
	if word == word.upper() or (len(word) >= 2 and word[0:2] == "__"):
		return word
	else:
		return stem(word.lower())
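A brief sketch of the three branches of my_stem (the tokens are made up): all-caps tokens and "__"-prefixed placeholders pass through untouched, everything else is lower-cased and stemmed:

print(my_stem("RUNNING"))      # "RUNNING" (all caps, returned unchanged)
print(my_stem("__entity__"))   # "__entity__" (placeholder, returned unchanged)
print(my_stem("Running"))      # e.g. "run"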
Example #29
    def check(self, candidate):
        if len(candidate) == 0:
            return False

        normed = stem(candidate.lower())
        return normed in self.stems
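A minimal wrapper showing how this check method might be used; the StemSet class name and its stems attribute are assumptions made only for this sketch, with stem taken from stemming.porter2:

from stemming.porter2 import stem

class StemSet(object):
    def __init__(self, words):
        # keep the stems of a known vocabulary
        self.stems = set(stem(w.lower()) for w in words)

    def check(self, candidate):
        if len(candidate) == 0:
            return False
        normed = stem(candidate.lower())
        return normed in self.stems

vocab = StemSet(["running", "jumps"])
print(vocab.check("runs"))     # True  ("run" is in the stem set)
print(vocab.check("walked"))   # False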
def BOW(db,corpus,cwd):
	"""
	This function creates the bag-of-words representation for all the corpus documents and creates a database for all words.

	"""
	TOTAL_WORDS=0
	doc_freq=dict()
	DOC_LIST=dict()
	doc_dic=dict()
	no_of_doc=0
	words=[]
	#Create the DOCF table
	db.create_table_docf();
	# Execute shell script which get document list from corpus folder
	command="sh getdoclist.sh "+corpus+" "+cwd
	ret=os.system(command)
	if ret!=0:
		print "Error creating document list\n."
		exit(1)		
	stop=string.punctuation #punctuation removed
	#Open file containing name of documents in corpus
	fin=open("doclist","r")
	#Open each document at a time and construct Bag of Words for each document
	
	for line in fin:
		openfile=corpus+"/"+line.strip()
		fdoc=open(openfile,"r")
		no_of_doc+=1
		#Read opened doc line by line
		for sentence in fdoc:
			#convert into lower case and tokenize using nltk
			#Add functionality to handle punctuation '.' and ''
			if '.' in sentence:
				sentence=sentence.replace('.',' ')
			if "'" in sentence:
				sentence=sentence.replace("'","")
			if "`" in sentence:
				sentence=sentence.replace("`","")
			if '''"''' in sentence:
				sentence=sentence.replace('''"''',"")
			sentence=sentence.decode("utf8")
			sentence=nltk.word_tokenize(sentence.lower())
			#if wrd from sentence list , not in stop 'list' then add it to a list of words for doc
			for word in sentence:
				if word not in stop:
					#Add suitable stemmer here-Porter
					word=porter2.stem(word)
					doc_dic[word]=doc_dic.get(word,0)+1
					TOTAL_WORDS=TOTAL_WORDS+1
			#for x in words:
			#	print x
		#Create table corresponding to the Doc. Splitting 'doc_name.txt' and creating a table named docname
		docname="d"+line.split('.')[0]
		DOC_LIST[docname]=line.strip();
		db.create_table_doc(docname)
		#All words of the doc are added to words. Now add them to doc db
		#Also updates Doc Freq for the words
		temp=[(word,count) for word,count in doc_dic.items()]
		for word,count in temp:
			doc_freq[word]=doc_freq.get(word,0)+1
			db.insert_into_doc(docname,word,count)
		doc_dic.clear()

	#Close file doc_list
	fin.close()
	#All Documents are processed and their corresponding tables made.
	#Also doc_freq now contains the list of words and the number of documents in which they occur.
	#Set no. of corpus documents
	db.set_no_of_doc(no_of_doc)
	db.set_total_words(TOTAL_WORDS)
	#Add doc_freq to doc_freq table
	temp=[(word,doc_count) for word,doc_count in doc_freq.items()]
	for word,doc_count in temp:
		db.add_to_doc_freq(word,doc_count)
	return DOC_LIST 
Example #31
def stem_phrase(phrase):
    words = phrase.split()
    for i in xrange(0, len(words)):
        words[i] = porter2.stem(words[i])

    return ' '.join(words)
Example #32
def BOW(db, corpus, cwd):
    """
	This function creates the bag-of-words representation for all the corpus documents and creates a database for all words.

	"""
    TOTAL_WORDS = 0
    doc_freq = dict()
    DOC_LIST = dict()
    doc_dic = dict()
    no_of_doc = 0
    words = []
    #Create the DOCF table
    db.create_table_docf()
    # Execute shell script which get document list from corpus folder
    command = "sh getdoclist.sh " + corpus + " " + cwd
    ret = os.system(command)
    if ret != 0:
        print "Error creating document list\n."
        exit(1)
    stop = string.punctuation  #punctuation removed
    #Open file containing name of documents in corpus
    fin = open("doclist", "r")
    #Open each document at a time and construct Bag of Words for each document

    for line in fin:
        openfile = corpus + "/" + line.strip()
        fdoc = open(openfile, "r")
        no_of_doc += 1
        #Read opened doc line by line
        for sentence in fdoc:
            #convert into lower case and tokenize using nltk
            #Add functionality to handle punctuation '.' and ''
            if '.' in sentence:
                sentence = sentence.replace('.', ' ')
            if "'" in sentence:
                sentence = sentence.replace("'", "")
            if "`" in sentence:
                sentence = sentence.replace("`", "")
            if '''"''' in sentence:
                sentence = sentence.replace('''"''', "")
            sentence = sentence.decode("utf8")
            sentence = nltk.word_tokenize(sentence.lower())
            #if wrd from sentence list , not in stop 'list' then add it to a list of words for doc
            for word in sentence:
                if word not in stop:
                    #Add suitable stemmer here-Porter
                    word = porter2.stem(word)
                    doc_dic[word] = doc_dic.get(word, 0) + 1
                    TOTAL_WORDS = TOTAL_WORDS + 1
            #for x in words:
            #	print x
        #Create table corresponding to the Doc. Splitting 'doc_name.txt' and creating a table named docname
        docname = "d" + line.split('.')[0]
        DOC_LIST[docname] = line.strip()
        db.create_table_doc(docname)
        #All words of the doc are added to words. Now add them to doc db
        #Also updates Doc Freq for the words
        temp = [(word, count) for word, count in doc_dic.items()]
        for word, count in temp:
            doc_freq[word] = doc_freq.get(word, 0) + 1
            db.insert_into_doc(docname, word, count)
        doc_dic.clear()

    #Close file doc_list
    fin.close()
    #All Documents are processed and their corresponding tables made.
    #Also doc_freq now contains the list of words and the number of documents in which they occur.
    #Set no. of corpus documents
    db.set_no_of_doc(no_of_doc)
    db.set_total_words(TOTAL_WORDS)
    #Add doc_freq to doc_freq table
    temp = [(word, doc_count) for word, doc_count in doc_freq.items()]
    for word, doc_count in temp:
        db.add_to_doc_freq(word, doc_count)
    return DOC_LIST
Example #33
 def stem_leaf(self):
     for leaf in self.leaves:
         leaf.stem = stem(leaf.value)
Example #34
 def __stem(self, tok):
     tok = re.sub(r"[\.|,|:|\?|\"|\'|;|!]+$", "", tok)
     tok = porter2.stem(tok)
     return tok
Example #35
def stemWords(word):
    return porter2.stem(word).lower()
Example #36
 def __stem(self, tok):
     tok = re.sub(r"[\.|,|:|\?|\"|\'|;|!]+$", "", tok)
     tok = porter2.stem(tok)
     return tok
Example #37
    answer = answer.lower()

    word_list = []
    # remove all punctuation 
    for m in word.finditer(answer):
        # we are on word w
        w = m.group(0)
        # skip stopwords
        if w in STOPWORDS:
            continue
        # skip two letter words
        if len(w)<3:
            continue

        if dostem:
            w_stem = stem(w)
        else:
            w_stem = w

        word_list.append(w_stem)

    answer2 = " ".join(word_list) 

    low_file.write( answer2 + "\n" )
    ids_file.write( user_id+","+question_id + "\n" )
    
    counter += 1

print "processed ", counter, " entries"

Example #38
 def stem_leaf(self):
     for leaf in self.leaves:
         leaf.stem = stem(leaf.value)
Example #39
def getTextFeatures(dirName):

	engStopWords = stopwords.words('english')

	# initializations:
	ignoreCase = True
	iDoc = 0
	tf = []
	idf = []
	tfidf = []
	nWords = []
	words = []
	allFiles = []
	
	for subdir, dirs, files in os.walk(dirName):
	    files.sort() 
	    for file in files:  							# for each file in the given directory:
		file = dirName + file							# update the list of files analyzed:
		allFiles.append(file)

		# print repr(iDoc) + " of " + repr(len(files)) + file
		nWords.append(0) 							# add count of total words in the current document
		for line in open(file): 						# for each line in the current document:
			if ignoreCase: line = line.lower()
			tokenizer = RegexpTokenizer('[\d\.]+|\w+|\$[\d\.]+')		# initialize tokenizer
			tokens = tokenizer.tokenize(line)
			
			for word in tokens: # for each word:
				if len(word) > 2 and word not in engStopWords: 		# if the word is not in the list of stop words and its length is at least 3
					#stemmer = SnowballStemmer("german") 		# TODO: other languages (language detection). Use SnowballStemmer.languages to see list of languages
					#word = stemmer.stem(word)					
					#word = PorterStemmer().stem_word((word))
					word = stem(word)
					# word = WordNetLemmatizer().lemmatize(word, 'v') # stemming

					nWords[iDoc] += 1; 				# update the number of total words in the document
					if word not in words:				# if the current word is not in the GLOBAL list of words (bag of words):
						tf.append([0]*len(files))		# add zeros to the tf matrix
						tfidf.append([0]*len(files))		# add zeros to the tf-idf matrix
						tf[-1][iDoc] += 1			# increase by 1 the tf value of the current element for the current word
						words.append(word)			# add the word to the bag of words
						idf.append(0)				# add a zero to the idf array
					else:
						idxWord = words.index(word)		# find the index of the current word in the bag of words
						tf[idxWord][iDoc] += 1			# update the term frequency matrix
		iDoc = iDoc + 1 							# current number of processed documents:

	numOfDocs = float(iDoc); 							# total number of processed docs

	# post process: compute the final tf array and compute the idf counter:
	for i in range(len(tf)): 							# for each word
		for j in range(len(tf[i])): 						# for each document
			if tf[i][j] > 0:						# if the current word has appeared (at least once) in the current document:
				idf[i] += 1.0						# update the idf counter
			if (nWords[j] > 0):
				tf[i][j] = tf[i][j] / float(nWords[j])			# normalize the tf value
			else:
				tf[i][j] = 0.0

	T1 = 1.0
	idfTemp = []
	tfTemp = []
	wordsTemp = []
	tfidfTemp = []
	for i in range(len(idf)):
		if idf[i]>T1:
			idfTemp.append(idf[i])
			tfTemp.append(tf[i])
			wordsTemp.append(words[i])
			tfidfTemp.append(tfidf[i])

	idf = list(idfTemp)
	tf = list(tfTemp)
	words = list(wordsTemp)
	tfidf = list(tfidfTemp)

	dFreq = []

	# compute the final tf value
	for i in range(len(idf)):
		dFreq.append(idf[i] / numOfDocs)
		idf[i] = 1.0 + math.log10(numOfDocs/idf[i])
	
	# compute the tf-idf value:
	for i in range(len(tf)): 							# for each word
		for j in range(len(tf[i])): 						# for each document
			tfidf[i][j] = idf[i] * tf[i][j]

	return (allFiles,words, tf, idf, tfidf, dFreq)
Example #40
          surname, givenname = '', ''
          for data in child.iter('surname'):
            surname = data.text
          for data in child.iter('given-names'):
            givenname = data.text
          authorToNumber[string.join([givenname, surname], ' ')].append(
              xmlNumber)
    for child in itertools.chain(
        xmldata.getroot().iter('p'), xmldata.getroot().iter('title')):
      #does the text
      if child.text is not None:
        lowerText = string.lower(child.text).encode('ascii', 'ignore')
        mangledText = re.sub(r'\W+', ' ', lowerText)  # removes non alphabet
        words = string.split(mangledText)
        for word in words:
          stemmed = porter2.stem(word)  # stemming removes -ing -ed etc
          if not isNumber(stemmed):   # if converts to a float, don't do it
            numberToWords.update([stemmed])
        numberToWordCount[xmlNumber] += len(words)
  except xml.etree.ElementTree.ParseError:  # article not valid xml
    pass
  outNTW.write(str(xmlNumber) + ' ' + str(len(numberToWords)) + ' ')
  for oneWord in numberToWords:
    outNTW.write(oneWord + ' ')
  outNTW.write('\n')
outNTW.close()
#just write these to plaintext files. janky but lets others use the data
#easily without having to unpack pickled files or whatever other solution
#i could choose from.
outNTT = open('number.to.type.txt', 'w')
for outdata in numberToType.iteritems():
Example #41
def getTextFeatures2_notfidf(dirName):

	engStopWords = stopwords.words('english')
	mystopwords = ["a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also","although","always","am","among", "amongst", "amoungst", "amount",  "an", "and", "another", "any","anyhow","anyone","anything","anyway", "anywhere", "are", "around", "as",  "at", "back","be","became", "because","become","becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom","but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven","else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own","part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the"]
	mergedStopWords = list(set(engStopWords + mystopwords))

	# initializations:
	ignoreCase = True
	iDoc = 0
	nWords = []
	words = []
	allFiles = []
	dFreq = []

	for subdir, dirs, files in os.walk(dirName):
	    files.sort() 
	    for file in files:  							# for each file in the given directory:
		#if len(allFiles)>100:
		#	break;
		file = dirName + file							# update the list of files analyzed:
		allFiles.append(file)
		curWords = []
		# print repr(iDoc) + " of " + repr(len(files)) + file
		nWords.append(0) 							# add count of total words in the current document
		statinfo = os.stat(file)
		Size = statinfo.st_size
		if Size<500:
			continue
		if Size>550:
			continue
        
		for line in open(file): 						# for each line in the current document:
			if ignoreCase: line = line.lower()
			if Size<500:
				print Size
			if Size>3000:
				print Size

			tokenizer = RegexpTokenizer('[\d\.]+|\w+|\$[\d\.]+')		# initialize tokenizer
			tokens = tokenizer.tokenize(line)
			
			for word in tokens: # for each word:
				if len(word) > 2 and not word.isdigit() and word not in mergedStopWords: 	# if the word is not in the list of stop words and its length is at least 3
					# word = WordNetLemmatizer().lemmatize(word, 'v') # stemming
					# word = PorterStemmer().stem_word((word))
					word = stem(word)
					if word.isdigit():
						continue
					nWords[iDoc] += 1; 				# update the number of total words in the document
					
					if (word not in words):				# if the current word is not in the GLOBAL list of words (bag of words):
						words.append(word)			# add the word to the bag of words
						dFreq.append(1.0)				# add 1 to the dFreq array
					else:
						if (word not in curWords):			# current word is not in the list of ALREADY READ words of the current doc
							idxWord = words.index(word)		# find the index of the current word in the bag of words
							dFreq[idxWord] += 1.0			# update the dFreq matrix				
					curWords.append(word)

			
		nGrams2 = ngrams(curWords, 2)
		for ngram, count in nGrams2.iteritems():	
			if count>5:
				if ngram not in words:				# if the current word is not in the GLOBAL list of words (bag of words):
					words.append(ngram)			# add the word to the bag of words	
					#print "NGRAM: " + str(len(words))
					dFreq.append(1.0)			# add '1.0' to the dFreq array
				else:
					idxWord = words.index(ngram)		# find the index of the current word in the bag of words
					dFreq[idxWord] += 1.0			# update the freq
			
		iDoc = iDoc + 1 						# current number of processed documents:	
	#	print words, dFreq
	#	raw_input("Press ENTER to exit")
		
	numOfDocs = float(iDoc); 						# total number of processed docs

	dFreq2 = []
	words2 = []

	for i,d in enumerate(dFreq):
		if d>1:
			dFreq2.append(d)
			words2.append(words[i])
	
	dFreq = dFreq2
	words = words2

	# compute the final df value
	for i in range(len(dFreq)):
		dFreq[i] = (dFreq[i] / numOfDocs)

	print len(dFreq)
	return (allFiles, words, dFreq)
Example #42
def stem(words):
    return {porter2.stem(word) for word in words}
Example #43
File: gut2.py Project: adbreind/rye
# make RDD with list of words along with their position in the original text (so we can find context later)
text1_tokensRDD = sc.parallelize(text1_tokens).zipWithIndex()
text2_tokensRDD = sc.parallelize(text2_tokens).zipWithIndex()
#print text1_tokensRDD.take(5)

# define a list of stop words (chosen fairly arbitrarily)
stop_words = ['a', 'i', 'an', 'as', 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', 'aint', 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', 'arent', 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'both', 'brief', 'but', 'by', 'cmon', 'cs', 'came', 'can', 'cant', 'cannot', 'cant', 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come', 'comes', 'concerning', 'consequently', 'consider', 'considering', 'contain', 'containing', 'contains', 'corresponding', 'could', 'couldnt', 'course', 'currently', 'definitely', 'described', 'despite', 'did', 'didnt', 'different', 'do', 'does', 'doesnt', 'doing', 'dont', 'done', 'down', 'downwards', 'during', 'each', 'edu', 'eg', 'eight', 'either', 'else', 'elsewhere', 'enough', 'entirely', 'especially', 'et', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'exactly', 'example', 'except', 'far', 'few', 'fifth', 'first', 'five', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'four', 'from', 'further', 'furthermore', 'get', 'gets', 'getting', 'given', 'gives', 'go', 'goes', 'going', 'gone', 'got', 'gotten', 'greetings', 'had', 'hadnt', 'happens', 'hardly', 'has', 'hasnt', 'have', 'havent', 'having', 'he', 'hes', 'hello', 'help', 'hence', 'her', 'here', 'heres', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'hi', 'him', 'himself', 'his', 'hither', 'hopefully', 'how', 'howbeit', 'however', 'id', 'ill', 'im', 'ive', 'ie', 'if', 'ignored', 'immediate', 'in', 'inasmuch', 'inc', 'indeed', 'indicate', 'indicated', 'indicates', 'inner', 'insofar', 'instead', 'into', 'inward', 'is', 'isnt', 'it', 'itd', 'itll', 'its', 'its', 'itself', 'just', 'keep', 'keeps', 'kept', 'know', 'known', 'knows', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', 'lets', 'like', 'liked', 'likely', 'little', 'look', 'looking', 'looks', 'ltd', 'mainly', 'many', 'may', 'maybe', 'me', 'mean', 'meanwhile', 'merely', 'might', 'more', 'moreover', 'most', 'mostly', 'much', 'must', 'my', 'myself', 'name', 'namely', 'nd', 'near', 'nearly', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'no', 'nobody', 'non', 'none', 'noone', 'nor', 'normally', 'not', 'nothing', 'novel', 'now', 'nowhere', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'own', 'particular', 'particularly', 'per', 'perhaps', 'placed', 'please', 'plus', 'possible', 'presumably', 'probably', 'provides', 'que', 'quite', 'qv', 'rather', 'rd', 're', 'really', 'reasonably', 'regarding', 'regardless', 'regards', 'relatively', 'respectively', 'right', 'said', 'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 
'several', 'shall', 'she', 'should', 'shouldnt', 'since', 'six', 'so', 'some', 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup', 'sure', 'ts', 'take', 'taken', 'tell', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', 'thats', 'thats', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'theres', 'thereafter', 'thereby', 'therefore', 'therein', 'theres', 'thereupon', 'these', 'they', 'theyd', 'theyll', 'theyre', 'theyve', 'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'took', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', 'twice', 'two', 'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use', 'used', 'useful', 'uses', 'using', 'usually', 'value', 'various', 'very', 'via', 'viz', 'vs', 'want', 'wants', 'was', 'wasnt', 'way', 'we', 'wed', 'well', 'were', 'weve', 'welcome', 'well', 'went', 'were', 'werent', 'what', 'whats', 'whatever', 'when', 'whence', 'whenever', 'where', 'wheres', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whos', 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 'wont', 'wonder', 'would', 'wouldnt', 'yes', 'yet', 'you', 'youd', 'youll', 'youre', 'youve', 'your', 'yours', 'yourself', 'yourselves', 'zero']
 
# get rid of sequences of non-word chars, keep remaining strings with something in them, and not in stop list:
text1_tokensRDD = text1_tokensRDD.map(lambda p:(re.sub('\W+', '', p[0]).lower(), p[1])).filter(lambda p:len(p[0])>0 and not p[0] in stop_words) 
#print text1_tokensRDD.take(5)
text2_tokensRDD = text2_tokensRDD.map(lambda p:(re.sub('\W+', '', p[0]).lower(), p[1])).filter(lambda p:len(p[0])>0 and not p[0] in stop_words) 

# stem the words using imported stem function (chosen arbitrarily)
text1_stemmedRDD = text1_tokensRDD.map(lambda p:(stem(p[0]), p[1])) 
#print text1_stemmedRDD.take(5)
text2_stemmedRDD = text2_tokensRDD.map(lambda p:(stem(p[0]), p[1])) 

# for each word, get the list of loci:
text1_concRDD = text1_stemmedRDD.groupByKey()
#print text1_concRDD.take(5)
text2_concRDD = text2_stemmedRDD.groupByKey()

# find every pair of words (brute force)
text1_bigrams = text1_concRDD.cartesian(text1_concRDD)
#print text1_bigrams.first()
text2_bigrams = text2_concRDD.cartesian(text2_concRDD)

# eliminate transposed pairs, and dupes -- keep ("a","b"); not ("b", "a") or ("a", "a") etc
text1_bigrams = text1_bigrams.filter(lambda p:p[0][0] < p[1][0])
Example #44
def classifyFile(fileName, dictionaries, dictionariesWeight, dictionariesNames, numOfResultsReturned, PLOT_FIGURE):

		
	engStopWords = stopwords.words('english')		
	ignoreCase = True

	nClasses = len(dictionaries)

	Ps  = [0.0] * nClasses
	nWords = 0;
	curWords = []
	curFreqs = []
	totalWords = 0
	tokenizer = RegexpTokenizer('[\d\.]+|\w+|\$[\d\.]+')		# initialize tokenizer
	# STEP A: GENERATE LIST OF WORDS (AFTER STEMMING AND STOPWORD REMOVAL):
	for line in open(fileName): 						# for each line in the current document:
		if ignoreCase: line = line.lower()
		tokens = tokenizer.tokenize(line)	
		
		for word in tokens: # for each word:
			if len(word) > 2 and word not in engStopWords: 		# if the word is not in the list of stop words and its length is at least 3
				#word = WordNetLemmatizer().lemmatize(word, 'v') # stemming
				# word = PorterStemmer().stem_word((word))
				word = stem(word)
				totalWords += 1.0
				if word not in curWords:
					curWords.append(word)
					curFreqs.append(1.0)
				else:
					curFreqs[curWords.index(word)] += 1.0

	normalizeFactor = (totalWords / 15.0)

	# STEP B: PROBABILITY PRODUCT COMPUTATION (BASED ON SINGLE WORDS)
	for iword, word in enumerate(curWords):
		FOUND_word = 0;
		for d in range(len(dictionaries)):
			dic = dictionaries[d]			
			if word in dic:
				idxWord = dic.index(word)
				toMulti = 1.0 + dictionariesWeight[d][idxWord]				
				#Ps[d] *= (toMulti + (1.0 - (1.0/nClasses)))**(curFreqs[iword]/normalizeFactor)	
				#if toMulti>20:
				Ps[d] += math.log(toMulti)								
				FOUND_word = 1;
			else:
				#Ps[d] *= (1.0 - (1.0/nClasses))**(1.0/normalizeFactor)
				Ps[d] += (0.0)
		if (FOUND_word==1):
			#print word
			nWords += 1

	print Ps
	
	# STEP C: PROBABILITY PRODUCT COMPUTATION (BASED ON N-GRAMS):
	nGrams2 = ngrams(curWords, 2)
	for ngram, count in nGrams2.iteritems():
		FOUND_word = 0;
		for d in range(len(dictionaries)):
			dic = dictionaries[d]	
			if ngram in dic:
					idxWord = dic.index(ngram)
					toMulti = 1.0 + dictionariesWeight[d][idxWord]
					Ps[d] += math.log(toMulti)
					# Ps[d] *= (toMulti + (1.0 - (1.0/nClasses)))**(1.0/normalizeFactor)
					# print nGram, toMulti
					FOUND_word = 1;
			else:
					#Ps[d] *= (1.0 - (1.0/nClasses))**(1.0/normalizeFactor)
					Ps[d] += 0.0
		if (FOUND_word==1):
			#print word
			nWords += 1
	print Ps
	
	for d in range(nClasses):
		if nWords>0:        
			Ps[d] /= len(curWords)
			# Ps[d] /= (len(dictionariesWeight[d])+0.00000000001)
			# Ps[d] /= nWords
			# Ps[d] /= sum(dictionariesWeight[d])    
		else:
			Ps[d] = 0	
	
	MEANPs = mean(Ps)
	MAX = max(Ps)

	finalLabels = []
	finalLabelsPs = []


	IndecesSorted = [i[0] for i in sorted(enumerate(Ps), key=lambda x:x[1], reverse=True)]
	
	for i in range(numOfResultsReturned):			
		finalLabels.append(dictionariesNames[IndecesSorted[i]])
		#finalLabelsPs.append(Ps[IndecesSorted[i]] / MAX)
		finalLabelsPs.append(Ps[IndecesSorted[i]])

#	for i in range(len(Ps)):
#		if Ps[i] > 2.0 * MEANPs:
#			# print(dictionariesNames[i] + "\t\t\t\t" + str(Ps[i]))
#			finalLabels.append(dictionariesNames[i])
#			finalLabelsPs.append(Ps[i])

	if (PLOT_FIGURE==1):
		fig = plt.figure()
		plt.bar(arange(1,numOfResultsReturned+1)-0.5, array(finalLabelsPs))
		for i in range(numOfResultsReturned):
			plt.text(i+1, 0, finalLabels[i], rotation=90, size=10,horizontalalignment='center',verticalalignment='bottom')
		plt.xticks(range(numOfResultsReturned), [], size='small')
	
		plt.show();

	#print nClasses
	#plt.savefig('new.png', dpi = 500);
	
	return (finalLabels, finalLabelsPs)	
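# classifyFile calls an ngrams helper that is not shown in this example. Below is a
# minimal sketch under the assumption that it returns a dict mapping each run of n
# consecutive words to its count; the original may key on joined strings rather than
# tuples.
from collections import defaultdict

def ngrams(words, n):
    counts = defaultdict(int)
    for i in range(len(words) - n + 1):
        # count every window of n consecutive words
        counts[tuple(words[i:i + n])] += 1
    return dict(counts)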
def stemWords(word):
    return porter2.stem(word).lower()
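# Hedged usage sketch for classifyFile: every value below (file name, dictionaries,
# weights, class names) is a made-up placeholder, not data from the original project;
# dictionary entries are assumed to be pre-stemmed.
labels, probs = classifyFile("sample.txt",
                             [["goal", "match"], ["market", "stock"]],   # per-class word lists
                             [[2.0, 1.5], [1.8, 1.2]],                   # per-word weights
                             ["sports", "finance"],                      # class names
                             2,                                          # numOfResultsReturned
                             0)                                          # PLOT_FIGURE (no plot)
print labels, probs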
Example #46
0
def clean_word(w):
    return re.sub(r'[,.;:?!\[\]{}/\\]', '', stem(w.lower()))
Example #47
0
def processWikiText(ipfile,file):
    f=open(ipfile,"r")
    pagetrie=dict()
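    # pagetrie entry layout, as inferred from the updates below:
    #   indices 0-4: per-zone document-frequency counts
    #                (title, infobox, category, [[link]] text, body text)
    #   indices 5-9: the matching per-zone dicts mapping docId -> term frequency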
    lines=f.readlines()
    f.close()
    lenlines=lines.__len__()
    i=0
    count=0
    title=''
    docId=''
    text=''
    infoflag=0
    catflag=0
    txtflag=0
    outflag=0
    tflag=0
    start=time.time()
    while i<lenlines:
        line=lines[i].lstrip(" ")
        line=line.rstrip("\n")
        if line=="<page>":
            i+=1
            line=lines[i].lstrip(" ")
            line=line.rstrip("\n")
            if line[:7]=="<title>":
                title=line[7:-8]
            i+=2
            line=lines[i].lstrip(" ")
            line=line.rstrip("\n")
            if line[:4]=="<id>":
                docId=line[4:-5]
                docId=base.decimaltobase62(int(docId))
            term=''
            tlength=title.__len__()
            for j in xrange(tlength):
                lower=title[j].lower()
                if lower<'a' or lower>'z':
                    if term:
                        if not(stop_list.has_key(term)):
                            stem=porter2.stem(term)
                            if pagetrie.has_key(stem):
                                v=pagetrie[stem]
                                if v[5].has_key(docId):
                                    v[5][docId]+=1
                                else:
                                    v[0]+=1
                                    v[5][docId]=1
                            else:
                                pagetrie[stem]=[1,0,0,0,0,{docId:1},{},{},{},{}]
                    term=''
                else:
                    term+=lower
            if len(term)>0 and not(stop_list.has_key(term)):
                stem=porter2.stem(term)
                if pagetrie.has_key(stem):
                    v=pagetrie[stem]
                    if v[5].has_key(docId):
                        v[5][docId]+=1
                    else:
                        v[0]+=1
                        v[5][docId]=1
                else:
                    pagetrie[stem]=[1,0,0,0,0,{docId:1},{},{},{},{}]
            infoflag=0
            catflag=0
            txtflag=1
            outflag=0
            print docId,title
            i+=1
            continue
        elif line[:7]=="</page>":
            count+=1
            i+=1
            continue
        elif line[:5]=="<text":  #text flag
            l=line.split(">",1)
            text=[]
            if l[1][-7:]=="</text>":
                line=l[1].rsplit("<",1)
                text.append(line[0])
                i+=1
            else:
                text.append(l[1])
                i+=1
                while True:
                    if lines[i][-8:]=="</text>\n":
                        line=lines[i].rsplit("<",1)
                        text.append(line[0])
                        break
                    text.append(lines[i])
                    i+=1
            text="".join(text)
            text="%s\n"%(text)
            txtlength=text.__len__()
            j=0
            term=''
            txtflag=1
            prevind=0
            flag=0
            while j<txtlength:
                if text.startswith("[[",j):
                    j+=2
                    if (text.startswith("Cate",j)):
                        catflag=1
                        flag=1
                        j+=9
                    else:
                        if flag==1: break
                        outflag=1
                    term=''
                    txtflag=0
                    while 1:
                        check=0
                        if text.startswith("]",j):
                            check=1
                        lower=text[j].lower()
                        if lower<'a' or lower>'z':
                            if len(term)>2:
                                if not(stop_list.has_key(term)):
                                    stem=porter2.stem(term)
                                    if pagetrie.has_key(stem):
                                        v=pagetrie[stem]
                                        if infoflag: 
                                            if v[6].has_key(docId):
                                                v[6][docId]+=1
                                            else:
                                                v[1]+=1
                                                v[6][docId]=1
                                        if catflag: 
                                            if v[7].has_key(docId):
                                                v[7][docId]+=1
                                            else:
                                                v[2]+=1 
                                                v[7][docId]=1
                                        if outflag: 
                                            if v[8].has_key(docId):
                                                v[8][docId]+=1
                                            else:
                                                v[3]+=1 
                                                v[8][docId]=1
                                    else:
                                        pagetrie[stem]=[0,infoflag,catflag,outflag,0,{},{},{},{},{}]
                                        v=pagetrie[stem]
                                        if infoflag: v[6][docId]=1
                                        if catflag: v[7][docId]=1
                                        if outflag: v[8][docId]=1
                            term=''
                        else:
                            term+=lower
                        if check==1:
                            outflag=0
                            catflag=0
                            j+=2
                            break
                        j+=1
                    if not(infoflag): txtflag=1
                elif text.startswith("{{",j):
                    j+=2
                    if text.startswith("Info",j):
                        infoflag=1
                        txtflag=0
                        j+=7
                        #print "infostart",infoflag
                    else:
                        while 1:
                            if text[j]=="}" or text[j]=='\n':
                                j+=2
                                break
                            j+=1
                elif text.startswith("}}\n",j):
                    infoflag=0
                    txtflag=1
                    #print "infoclose",infoflag
                    j+=3
                else:
                    lower=text[j].lower()
                    if lower<'a' or lower>'z':
                        if j-prevind>3: #j-prevind-1 == length
                            term=text[prevind+1:j].lower()
                            if not(stop_list.has_key(term)):
                                stem=porter2.stem(term)
                                if pagetrie.has_key(stem):
                                    v=pagetrie[stem]
                                    if infoflag: 
                                        if v[6].has_key(docId):
                                            v[6][docId]+=1
                                        else:
                                            v[1]+=1 
                                            v[6][docId]=1
                                    if catflag: 
                                        if v[7].has_key(docId):
                                            v[7][docId]+=1
                                        else:
                                            v[2]+=1 
                                            v[7][docId]=1
                                    if outflag: 
                                        if v[8].has_key(docId):
                                            v[8][docId]+=1
                                        else:
                                            v[3]+=1 
                                            v[8][docId]=1
                                    if txtflag: 
                                        if v[9].has_key(docId):
                                            v[9][docId]+=1
                                        else:
                                            v[4]+=1 
                                            v[9][docId]=1
                                else:
                                    pagetrie[stem]=[0,infoflag,catflag,outflag,txtflag,{},{},{},{},{}]
                                    v=pagetrie[stem]
                                    if infoflag: v[6][docId]=1
                                    if catflag: v[7][docId]=1
                                    if outflag: v[8][docId]=1
                                    if txtflag: v[9][docId]=1
                        prevind=j
                    j+=1
                    continue
                prevind=j
        i+=1
    print "proc%d"%(file)
    printTrie(file,pagetrie)
    pagetrie.clear()
    return count
Example #48
0
k = 0
scores = {}
length = {}
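# Names used below appear to come from earlier parts of this example and are assumed
# here: `keywords` is the tokenized query, single letters in "TICOB" mark the field
# being queried, `m` is a sorted list of indexed stems, `l` maps a stem to its base-62
# offset into the open index file `g`, and `width` bounds how many index lines are
# scanned after each seek.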
while k < len(keywords):
    if len(keywords[k]) == 0:
        k += 1
        continue
    if keywords[k] in "TICOB":
        field = keywords[k]
        k += 1
        continue
    lower = keywords[k].lower()
    if stop_list.has_key(lower):
        k += 1
        continue
    stem = porter2.stem(lower)
    word = bisect.bisect_left(m, stem)
    if m[word] != stem:
        word -= 1
    #print stem,word,m[word],l[m[word]]
    seek = base.base62todecimal(l[m[word]])
    g.seek(seek)
    counter = 0
    check = 0
    while counter < width:
        line = g.readline()
        line = line.rstrip("\n")
        text = line.split("#", 1)
        if text[0] == stem:
            check = 1
            break