Code example #1
def tokenize(sText, pairing=False):
    '''Given a string of text sText, returns a list of the individual stemmed tokens that 
        occur in that string (in order). This is my quick and dirty Tokenizer. 
        Satisfaction Not Guaranteed'''
    import re
    import string
    from stemmer import PorterStemmer
    sText = sText.lower()
    sText = re.sub("’", "'", sText)
    sText = re.sub("&.{0,6};", " ", sText)
    sText = re.sub("[\x80-\xff]", "", sText)
    sText = sText.split(None)
    for p in string.punctuation.replace("'", ""):
        try:
            sText = mapAndFold(lambda x: x.split(p), sText)
        except TypeError:  # empty string
            return []
    sText = mapAndFold(lambda x: x.split(), sText)
    sText = map(lambda x: x.strip("\'"), sText)
    sText = map(lambda x: x.strip("\""), sText)
    sText = map(lambda x: x.strip("_"), sText)
    sText = filter(lambda x: not re.match("\d+", x), sText)
    sText = filter(lambda x: not x == "", sText)
    sText = filter(lambda x: not x[0] == "@", sText)
    stemmer = PorterStemmer()
    if pairing:
        #return original with token val in tuple
        return [(w, stemmer.stem(w, 0, len(w) - 1)) for w in sText]
    return [stemmer.stem(w, 0, len(w) - 1) for w in sText]
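
Most of the examples on this page use the classic Porter stemmer reference implementation (https://tartarus.org/martin/PorterStemmer/python.txt, linked from code example #10), whose stem(p, i, j) method stems the buffer p between indices i and j inclusive and returns the stemmed string. A few of the examples below call stem(word) with a single argument; those appear to use a different stemmer variant (such as NLTK's PorterStemmer), whose stem() takes only the word. A minimal usage sketch, assuming the class is importable as stemmer.PorterStemmer as in code example #1 above (the module name varies from project to project):

from stemmer import PorterStemmer  # assumed import path; several projects below bundle the class under other module names

p = PorterStemmer()
word = "running"
# stem(buffer, start, end) stems buffer[start:end+1], which is why the examples
# on this page call stem(w, 0, len(w) - 1) to stem a whole word.
print(p.stem(word, 0, len(word) - 1))  # e.g. "run"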
Code example #2
File: utils.py Project: pronei/PlagarismCheck
    def __init__(self, fileA, fileB):
        self.__allWords = set()
        self.__wordsA = dict()
        self.__wordsB = dict()

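        # Count how often each stemmed word occurs in document A (document B is handled the same way below).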
        with open(fileA, 'r') as document:
            for line in document:
                words = line.strip().split()
                for word in words:
                    p = PorterStemmer()
                    word = p.stem(word, 0, len(word) - 1)
                    if word in self.__wordsA.keys():
                        self.__wordsA[word] += 1
                    else:
                        self.__wordsA[word] = 1

        with open(fileB, 'r') as document:
            for line in document:
                words = line.strip().split()
                for word in words:
                    p = PorterStemmer()
                    word = p.stem(word, 0, len(word) - 1)
                    if word in self.__wordsB.keys():
                        self.__wordsB[word] += 1
                    else:
                        self.__wordsB[word] = 1

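        # Combined vocabulary of both documents, plus a word -> integer index lookup table.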
        self.__allWords = set(self.__wordsA.keys()) | set(self.__wordsB.keys())
        self.__table = {t[1]: t[0] for t in enumerate(self.__allWords)}
Code example #4
def stemizeQuery(query):
	# STEMIZER FOR QUERY
	porter = PorterStemmer()
	newList = []
	for q in query:
		newList.append(porter.stem(q,0,len(q)-1))
	return newList
Code example #5
def stemize(dictList):
	#STEMIZER FOR DICTIONARY
	porter = PorterStemmer()		
	for d in dictList:
		for token in list(d.keys()):
			d[porter.stem(token, 0, len(token) - 1)] = d.pop(token)
	return dictList
Code example #6
def ReadInfoFile(infoFile):
    global text2pddl

    fin = open(infoFile)
    lines = fin.readlines()
    fin.close()
    lines = [ line.strip() for line in lines ]

    unitDict = {}
    p = PorterStemmer()
    for line in lines:
        if len(line) == 0: continue
        parts = line.split(':')
        textName = parts[0]
        words = [ p.stem(w.lower(), 0, len(w)-1) for w in textName.split() ]
        textName = ' '.join(words)
        if len(parts) > 1:
            pddlName = parts[1]
        else:
            pddlName = parts[0]
        assert(pddlName != '')
        assert(textName != '')
        for word in words:
            unitDict[word] = True

    #print unitDict.keys()

    return unitDict
Code example #7
def stemWords(listTokens):
	s = PorterStemmer()
	stemmedTerms = []

	for x in listTokens:
		stemmedTerms.append(s.stem(x, 0, len(x) - 1))

	return stemmedTerms
Code example #8
def stemWords(inList):
    outList = []
    ps = PorterStemmer()

    for token in inList:
        stemmed_token = ps.stem(token, 0, len(token) - 1)
        outList.append(stemmed_token)

    return outList
Code example #9
File: tools.py Project: davedash/mealadvisor
def stem_phrase(phrase):
    words = phrase.lower().replace('.', '').replace("'", '').split()

    # ignore stop words
    words = [word for word in words if not word in STOP_WORDS]

    p = PorterStemmer()

    return [p.stem(word, 0, len(word)-1) for word in words]
Code example #10
def stemWords(tokens):
    """Function that stems the words. """
    # use porter stemmer
    #  https://tartarus.org/martin/PorterStemmer/python.txt

    p = PorterStemmer()
    for index, word in enumerate(tokens):
        tokens[index] = p.stem(word, 0, len(word) - 1)

    return tokens
Code example #11
def stem(word):
    # word needs to be all lowercase before being passed to stem
    word = word.lower()

    # fancy stuff to remove .,?!"
    mymatch = re.compile('(\,|\.|\!|\?|\")')
    word = mymatch.sub(r'',word)

    p = PorterStemmer()
    word = p.stem(word, 0,len(word)-1)
   
    return word
Code example #12
def splitToken(token,isStem=True):
    toks = token.split('_')
    word = toks[0].lower()
    tag = toks[1]
    if not word.isalnum():
        tag = 'PUNC'
    if isStem:
        # simple post stem
        p = PorterStemmer()
        #word = p.stem1(word,0,len(word)-1)
        word = p.stem(word,0,len(word)-1)

    return (word, tag)
Code example #13
def GetWordDictionary(emails):
    word_dict = {}
    count = 0
    stemmer = PorterStemmer()
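    # Assign each distinct (stemmed) word found in the email bodies a unique integer id.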
    for email_case in emails:
        email = email_case[0]
        body = SplitText(email['body'])
        for word in body:
            modified_word = word
            if len(modified_word) > 1:
                modified_word = stemmer.stem(word, 0, len(word) - 1)

            if modified_word not in word_dict:
                word_dict[modified_word] = count
                count += 1
    return word_dict
Code example #14
def main():
    # Reading the document from the file
    fileName = "cran.all.1400"

    documents = readFromFile(fileName, "r")

    # Reading stop words from the file
    stopwordsList = readFromFile("stopwords.txt", "r")
    stopwords = stopwordsList.split()

    # For each document, record its id number, the number of unique terms it contains, and each term together with its term frequency.
    docId = 1

    # InvFileHash
    invFileHash = {}

    # Splits the multiple documents of the same file into list
    document = re.split(".I | \n.I", documents)[1:]
    totalDocument = len(document)
    print "Total documents:", totalDocument
    for doc in enumerate(document):
        startIndex = doc[1].index('.W\n')
        text = doc[1][startIndex + 3:]
        words = re.findall(r'\w+', text)

        pObj = PorterStemmer()
        listWords = {}
        for word in words:
            flagStopwords = word.lower() in stopwords
            if (not flagStopwords and word.isalpha()):
                stemWord = pObj.stem(word.lower(), 0, len(word) - 1)
                listWords = addToDict(listWords, stemWord)

        docList = addDoc(docId, listWords)
        docId += 1
        invFileHash = createInvFileHash(invFileHash, docList)

    # Writes to the Inverted File Hash file
    writeToFile(invFileHash)

    # To read the queries list from the cran query file
    queryFileRead = readFromFile("cran.qry", "r")

    # Calculate the Vector Space Model (total number of documents, stopwords list)
    vectorSpaceModel(totalDocument, queryFileRead, stopwords)
Code example #15
def classify(folds, nb_or_svm, ngrams, stemming, binary):
    p = PorterStemmer()

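    # Tokenize on whitespace and, when stemming is requested, Porter-stem each token before the n-gram counts are built.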
    vectorizer = CountVectorizer(input="filename", \
                                 ngram_range=ngrams, \
                                 tokenizer=(lambda d: [(p.stem(t, 0, len(t)-1) if stemming else t) for t in d.split()]), \
                                 binary=binary, \
                                 min_df=4, max_df=1.0)

    X = vectorizer.fit_transform([f[0] for fold in folds for f in fold])

    accuracies = []
    for i in range(len(folds)):
        classifier = SVC(gamma="auto", kernel="linear") if nb_or_svm[0] == "svm" \
                else MultinomialNB(alpha=(1.0 if nb_or_svm[1] else 1.0e-10))

        start_index = 0
        for j in range(i):
            start_index += len(folds[j])
        end_index = start_index + len(folds[i])

        test_set = X[start_index:end_index]
        training_set = vstack([X[:start_index], X[end_index:]])
        classifier.fit(
            training_set,
            [f[1] for fold in (folds[:i] + folds[i + 1:]) for f in fold])

        correct_predictions = 0
        results = classifier.predict(test_set)
        for j in range(len(results)):
            correct_predictions += int(results[j] == folds[i][j][1])

        accuracies.append(100 * correct_predictions / len(results))

    if nb_or_svm[0] != "svm":
        print("smoothed" if nb_or_svm[1] else "unsmoothed", end=" ")

    print("stemmed" if stemming else "unstemmed", \
          "presence" if binary else "frequency", \
          "unigrams" if ngrams == (1, 1) else \
          ("bigrams" if ngrams == (2, 2) else \
          ("uni + bi" if ngrams == (1, 2) else "unknown")), \
          "accuracy:", sum(accuracies)/len(accuracies))
Code example #16
def stemizeList(normalList):
	# STEMIZER FOR LIST
	porter = PorterStemmer()
	newList = []
	newDict = {}
	count = 0;	
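	# newList collects the stemmed tokens per inner list; newDict counts occurrences of the original (unstemmed) tokens.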
	for lists in normalList:
		tokenList = []
		for token in lists:
			#print normalList.index(lists)," ",lists.index(token)
			tokenList.append(porter.stem(token,0,len(token)-1))
			if token in newDict:
				count = newDict[token]
				newDict[token] = count +1
			else:
				newDict[token] = 1
		newList.append(tokenList)
			#token = porter.stem(token,0,len(token)-1)
			
	return newList,newDict
Code example #17
def GetPddlObj(_sWord):
    global text2pddl
    if text2pddl is None:
        ReadMinecraftDict(minecraftDictFile)

    setObjs = set()
    p = PorterStemmer()
    sLastWord = p.stem(_sWord.lower(), 0, len(_sWord)-1)
    if sLastWord == 'block': return setObjs
    #print sLastWord
    for sTextName in text2pddl.keys():
        if text2pddl[sTextName] == 'NULL': continue
        lstWords = sTextName.split(' ')
        sLastTextWord = lstWords[len(lstWords)-1]
        if sLastTextWord == 'block':
            if len(lstWords) == 1: continue
            sLastTextWord = lstWords[len(lstWords)-2]
        if sLastTextWord == sLastWord:
            setObjs.add(text2pddl[sTextName])
    return list(setObjs)
Code example #18
File: test.py Project: nreshel/Text-Parser
    def get_postlist(stop_answer, stem_answer, dict_terms):
        if stop_answer == 'no':
            stopwords = []
        if stop_answer == 'yes':
            stopwords = [
                'i', 'a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'by',
                'for', 'from', 'how', 'in', 'is', 'it', 'of', 'on', 'or',
                'that', 'the', 'this', 'to', 'was', 'what', 'when', 'where',
                'who', 'will', 'with', 'the'
            ]
        ps = PorterStemmer()
        position_list = []
        dict_posting = {}
        counter = 0
        for key, value in dict_terms.items():
            if isinstance(value, dict):
                for k, v in value.items():
                    if k == 'abstract':
                        val = v.replace(',', '').lower().split()
                        for index, word in enumerate(val):
                            if stem_answer == 'no':
                                stem_word = word
                            if stem_answer == 'yes':
                                stem_word = ps.stem(word, 0, len(word) - 1)
                            if stem_word not in stopwords:
                                if stem_word not in dict_posting:
                                    dict_posting[stem_word] = {}
                                if key not in dict_posting[stem_word]:
                                    dict_posting[stem_word][key] = {}
                                    dict_posting[stem_word][key][
                                        'frequency'] = 0
                                    dict_posting[stem_word][key][
                                        'position'] = []

                                dict_posting[stem_word][key]['frequency'] += 1
                                dict_posting[stem_word][key][
                                    'position'].append(index)
            with open('posting_list.json', 'w') as outfile:
                json.dump(dict_posting, outfile)
            print("Finished writing the posting list")
        return dict_posting
Code example #19
def ReadMinecraftDict(infoFile):
    global text2pddl

    fin = open(infoFile)
    lines = fin.readlines()
    fin.close()
    lines = [ line.strip() for line in lines ]

    text2pddl = {}
    p = PorterStemmer()
    for line in lines:
        if len(line) == 0: continue
        parts = line.split(':')
        textName = parts[0]
        words = [ p.stem(w.lower(), 0, len(w)-1) for w in textName.split() ]
        textName = ' '.join(words)
        if len(parts) > 1:
            pddlName = parts[1]
        else:
            pddlName = parts[0]
        assert(pddlName != '')
        assert(textName != '')
        text2pddl[textName] = pddlName
Code example #20
File: word.py Project: powerllamas/EZI
class Cleaner(object):

    def __init__(self, stopwords):
        self.stopwords = stopwords
        self.stemmer = PorterStemmer()

    def clean_word(self, word):
        word = word.strip().lower()
        word = filter(lambda c: c.isalnum(), word)
        if word in self.stopwords:
            word = None
        else:
            word = self.stemmer.stem(word, 0, len(word) - 1)
        return word

    def clean_wordlist(self, wordlist):
        wordlist = " ".join(wordlist).replace('-', ' ').split()
        clean_list = map(lambda x: self.clean_word(x), wordlist)
        return [word for word in clean_list if word]

    @staticmethod
    def make_printable(phrase):
        return filter(lambda c: c in string.printable, phrase)
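
A minimal usage sketch for the Cleaner class above. The snippet is Python 2 style (its filter/map calls are expected to return lists and strings), and the stop-word set here is hypothetical; the project supplies its own list:

stopwords = set(["the", "a", "of"])  # hypothetical stop words, for illustration only
cleaner = Cleaner(stopwords)
# Splits on hyphens, strips non-alphanumeric characters, drops stop words and stems the rest.
print cleaner.clean_wordlist(["The", "quick-brown", "tokenizers"])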
Code example #21
def GetTFIDF():
	emails = None
	x_vals = []
	y_vals = []
	stemmer = PorterStemmer()
	
	# Get email chains
	with open("balanced_chains.pickle", "rb") as fp1:   # Unpickling
		emails = pickle.load(fp1)

	np.random.shuffle(emails)
	i = 0
	text_data = []
	for i in range(0, len(emails)):
		print "Evaluation Email %d" % (i)
		email, next_email, time_diff, bucket = emails[i]
		# if int(np.round(time_diff / 60)) > 72:
		# 	continue
		# Create stemmed body and append to text_data
		new_str = ""
		words = email['body'].split()
		for word in words:
			new_word = stemmer.stem(word, 0, len(word) - 1)
			new_str += new_word + " "
		new_str = new_str[:-1]
		text_data.append(new_str)

		# Append hour
		y_vals.append(int(np.round(time_diff / 60)))
		#y_vals.append(int(time_diff)

	b = np.array(y_vals)
	count_vect = CountVectorizer()
	X_train_counts = count_vect.fit_transform(text_data)
	tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
	X_train_tf = tf_transformer.transform(X_train_counts)
	return X_train_tf, b, count_vect, tf_transformer, text_data
Code example #22
def process(text):
    '''Returns a list of words after carrying out the
    following text preprocessing and normalization steps'''
    # Convert text to lower case
    text = text.lower()
    #Remove 'Subject'
    text = re.sub(r'^sub(ject)?', ' ', text)
    # Strip HTML
    text = re.sub(r'<.*?>', ' ', text)
    # Normalize URLs
    text = re.sub(r'(http|https|ftp)://\S*', ' httpaddr ', text)
    # Normalize email addresses
    text = re.sub(r'[\w.+-]+@[\w.-]+', ' emailaddr ', text)
    # Normalize numbers
    text = re.sub(r'\b\d[\d,]*[.]*[\d]*\b', ' number ', text)
    # Normalize Dollars/Rupees
    text = re.sub(r'(\$|\brs\b|₹|£)+', ' dollar ', text)
    # Remove non-word characters
    text = re.sub(r'[^a-z]+', ' ', text)
    # Strip all whitespace characters and generate list of words
    # Stop Word Removal
    # stop_words = pickle.load(open('stopwords_set.pyset', 'rb'))
    text = [
        word for word in text.split()
        if word not in process.stop_words and len(word) > 2
    ]
    # Word Stemming
    p = PorterStemmer()
    result = []
    for word in text:
        try:
            stem_word = p.stem(word, 0, len(word) - 1)
            if stem_word not in process.stop_words:
                result.append(stem_word)
        except:
            pass
    return result
Code example #23
def vectorSpaceModel(totalDocument, queryFileRead, stopwords):
    """
    Query to calculate the cosine similarity between document d and Query Q
    """

    # Loads the inverted File Hash
    dic = loadFromFile()
    #
    queryList = processQueryList(queryFileRead)
    # Calculation of Inverse Document Frequency
    IDF = calculateIDF(dic, totalDocument)
    # Calculation of Term Frequency
    TF = calculateTFList(dic)
    # Calculation of Wd from all the Term Frequency calculated
    WD = calculateWD(TF, totalDocument)

    pObj = PorterStemmer()
    fileWrite = open("outputdocument.txt", "w")
    for query in queryList:
        fileWrite.write("\n---------------------------------------------------------------------------------------")
        fileWrite.write("\nQuery: " + query)
        # Separate the string of query into list of words
        listQuery = re.findall(r'\w+', query)
        # Remove the stopwords and numbers from the list of query words
        queryWithoutStopword = [x for x in listQuery if x not in stopwords and x.isalpha()]
        # Stem the list of query words
        processedQuery = [pObj.stem(x.lower(), 0, len(x) - 1) for x in queryWithoutStopword]
        # Calculate the cosine measure (Similarity) for the query
        rankedDocList = calculateSimilarity(processedQuery, IDF, WD, totalDocument)
        fileWrite.write("\nTotal number of documents retrieved: " + str(len(rankedDocList)))
        fileWrite.write("\nDocument ID:\n")
        fileWrite.write(''.join(str(rankedDocList)))
        fileWrite.write("\n---------------------------------------------------------------------------------------")
    fileWrite.close()

    print "Writing to outputdocument.txt file completes."
Code example #24
    def buildMatrix(self):
        # use suffix-stripping algorithm to stem word
        porter_stemmer = PorterStemmer()
        for index in range(0,len(self.origin_documents)):
            document = self.origin_documents[index]
            # change document in origin_document array to array of stemmed word
            self.origin_documents[index] = [porter_stemmer.stem(x, 0, len(x) - 1) for x in document.split()]

        # use 2000 most frequent words to generate words array
        temp_word = defaultdict(int)
        for document in self.origin_documents:
            for word in document:
                temp_word[word] += 1

        sorted_dict = sorted(temp_word.items(), key=operator.itemgetter(1))
        sorted_dict.reverse()
        self.words =  [x[0] for x in sorted_dict[0:self.word_size]]


        # build document array
        for index in range(0, len(self.origin_documents)):
            document = self.origin_documents[index]
            self.documents.append([])
            self.documents[index] = [document.count(word) for word in self.words]

        # print(self.documents[0], sum(self.documents[0]))

        # remove zero sum rows
        zeros = [i for i, value in enumerate(self.documents) if sum(value) == 0]
        for value in zeros[::-1]:
            del self.labels[value]
            del self.documents[value]

        # zeros = [i for i, value in enumerate(self.documents) if sum(value) == 0]

        print(len(self.origin_documents), len(self.words), len(self.documents), self.words)
Code example #25
def main():
    # Reading the document from the file
    file = open("cran.all.1400", "r")
    documents = file.read()
    # Reading stop words from the file
    fileStopwords = open('stopwords.txt', 'r')
    stopwordsList = fileStopwords.read()
    stopwords = stopwordsList.split()
    # For each document, record its id number, the number of unique terms it contains, and each term together with its term frequency.
    documentList = []
    docId = 1
    # Splits the multiple documents of the same file into list
    document = re.split(".I | \n.I", documents)[1:]

    for doc in enumerate(document):
        startIndex = doc[1].index('.W\n')
        text = doc[1][startIndex + 3:]
        words = re.findall(r'\w+', text)

        pObj = PorterStemmer()
        listWords = {}
        for word in words:
            flagStopwords = word.lower() in stopwords
            if (not flagStopwords and word.isalpha()):
                stemWord = pObj.stem(word, 0, len(word) - 1)
                listWords = addToDict(listWords, stemWord)

        sortedList = sorted(listWords.items(), key=lambda t: t[0])
        output = {'id': docId, 'unique': len(sortedList), 'terms': sortedList}
        docId += 1
        documentList.append(output)

    for i in range(0, len(documentList)):
        print "Document:", documentList[i][
            'id'], "\nUnique Terms:", documentList[i][
                'unique'], "\nTerms:\n", documentList[i]['terms']
Code example #26
def main():
    # Reading the document from the file
    file = open("cran.all.1400", "r")
    documents = file.read()
    # Reading stop words from the file
    fileStopwords = open('stopwords.txt', 'r')
    stopwordsList = fileStopwords.read()
    stopwords = stopwordsList.split()
    # For each document, record its id number, the number of unique terms it contains, and each term together with its term frequency.
    docId = 1

    #InvFileHash
    invFileHash = {}
    # Splits the multiple documents of the same file into list
    document = re.split(".I | \n.I", documents)[1:]

    for doc in enumerate(document):
        startIndex = doc[1].index('.W\n')
        text = doc[1][startIndex + 3:]
        words = re.findall(r'\w+', text)

        pObj = PorterStemmer()
        listWords = {}
        for word in words:
            flagStopwords = word.lower() in stopwords
            if (not flagStopwords and word.isalpha()):
                stemWord = pObj.stem(word, 0, len(word) - 1)
                listWords = addToDict(listWords, stemWord)

        docList = addDoc(docId, listWords)
        docId += 1
        invFileHash = createInvFileHash(invFileHash, docList)

    print "File written: output.json"
    print "Number of terms:",len(invFileHash)
    writeToFile(invFileHash)
Code example #27
File: indexer.py Project: dre21/mongodb_ft
class Indexer():

        # remove stop words and do stemming
        
        STOP_WORD_LIST = ["a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into","inward","is","isn't","it","it'd","it'll","it's","its","itself","just","keep","keeps","kept","know","knows","known","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","que","quite","qv","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure","t's","take","taken","tell","tends","th","than","thank","thanks","thanx","that","that's","thats","the","their","
theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying","twice","two","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","value","various","very","via","viz","vs","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","would","wouldn't","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","zero"]
        
        def __init__(self):
                logging.debug('Indexer => Init params:self')
                self.idx_fields = []            # field of document to be indexed
                #self.STOP_WORD_LIST = []
                self.P = PorterStemmer()

        # end of function
        '''
        def set_stop_words(self,stop_word_list):
                self.STOP_WORD_LIST = stop_word_list
        # end of function
        '''

        def set_idx_fields(self,fields):
                logging.debug('Indexer => set_idx_fields fields:' + str(fields))
                self.idx_fields = fields
                
        def add_idx_field(self,field_name):
                self.idx_fields.append(field_name)
        
        def clean(self,word):
                #preprocess word
                word = word.lower()
                word = word.strip("\n\t,.(){}?!;'")
                
                if word not in self.STOP_WORD_LIST:
                        word = self.P.stem(word,0,len(word)-1)
                else:
                        word = ""

                return word
        # end of function
        
        def tokenize(self, text):
                #list
                word_idx = []
                        
                # split lines
                lines = text.split('\n')
                
                for line in lines:
                        # split words
                        words = line.split(' ')
                        for word in words:
                                word = self.clean(word)                         
                                if len(word) > 1:
                                        word_idx.append(word)

                # make a set (remove duplicate)
                word_idx = set(word_idx)

                return word_idx
        # end of function
        
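        # Gather the text of the configured fields from a document and return its set of index terms.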
        def index(self, document):
                if isinstance(document,list): document = document[0]
                text = ""
                # get text from document to be indexed
                for field in self.idx_fields:
                        text += document[field] + " "
        
                return self.tokenize(text)
        
        def stem(self, words):
                return [self.tokenize(word) for word in words]
Code example #28
 def stemming(self, term):
     output = ""
     stem1 = PorterStemmer()
     output = stem1.stem(term)
     return output
Code example #29
File: help.py Project: shoosen/ibid
 def stemWord(self, word):
     return PorterStemmer.stem(self, word, 0, len(word) - 1)
Code example #30
File: invert.py Project: nreshel/Text-Parser
    def index_query(self, q, p_doc, s_answer):
        with open('posting_list.json') as posting_file:
            p_postlist = json.load(posting_file)
        with open('dictionary.json') as dict_file:
            p_dict = json.load(dict_file)
        self.query = q
        self.parsed_doc = p_doc
        self.parsed_postlist = p_postlist
        self.parsed_dictionary = p_dict
        self.stem_answer = s_answer
        index_dict = {}
        first_item = []
        abstract = ''
        summary = ''
        abstract_list = []
        ps = PorterStemmer()
        if self.stem_answer == 'no':
            stem_query = self.query
        if self.stem_answer == 'yes':
            stem_query = ps.stem(self.query, 0, len(self.query) - 1)
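        # Scan the loaded posting list for the (optionally stemmed) query term and collect its postings.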
        for term, value in self.parsed_postlist.items():
            if stem_query == term:
                if isinstance(value, dict):
                    for doc_id, val in value.items():
                        if term not in index_dict:
                            index_dict[term] = {}
                            index_dict[term]['documents'] = []
                            index_dict[term]['summary'] = ''

                        if self.query in self.parsed_doc[doc_id][
                                'abstract'] and len(first_item) == 0:
                            first_item.append(
                                self.parsed_doc[doc_id]['abstract'])
                            abstract = first_item[0]
                            abstract_list = abstract.split()
                            summary = abstract_list[abstract_list.index(
                                self.query)]
                            count = 0
                            while count < 10:
                                if abstract_list.index(
                                        self.query) + count < len(
                                            abstract_list):
                                    index_dict[term][
                                        'summary'] += abstract_list[
                                            abstract_list.index(self.query) +
                                            count] + ' '
                                else:
                                    break
                                count += 1

                        if doc_id not in index_dict[term]:
                            index_dict[term][doc_id] = {}
                            index_dict[term][doc_id]['position'] = val[
                                'position']
                            index_dict[term][doc_id]['term_frequency'] = val[
                                'frequency']
                            index_dict[term][doc_id][
                                'title'] = self.parsed_doc[doc_id]['title']
                        index_dict[term][
                            'doc_frequency'] = self.parsed_dictionary[term]
                        index_dict[term]['documents'].append(doc_id)
        pprint.pprint(index_dict)
Code example #31
def preprocessing(file_string):
	try:
		f = open(file_string)
		email_contents = f.read()
		#
		#
		#2 Strip the mail header
		#
		cut = re.search(r'\n[ \t]*\n',email_contents).span()[1] - 1
		#      \n...\n
		# cut        ↑
		email_contents = email_contents[cut:]
		#
		#
		#3 Other preprocessing
		#
		# lowercase ✓
		email_contents = email_contents.lower()
		# strip HTML tags ✓
		#
		email_contents = re.sub(r'<[^<>]+>',' ',email_contents);
		# replace URL links → httpaddr ✓
		# http//https :// xx.xxx.xxx
		email_contents = re.sub(r'http\:\/\/[\w\.\/]+|https\:\/\/[\w\.\/]+','httpaddr',email_contents)
		# 
		# replace email addresses → emailaddr
		#
		email_contents = re.sub(r'[\w\-\_]+\@[\w]+\.[\w]+','emailaddr',email_contents)
		# replace numbers → number ✓
		#    integers and decimals
		email_contents = re.sub(r'(\d+|\d+\.\d+)','number',email_contents)
		# replace dollar signs $ → dollar ✓
		# 
		email_contents = re.sub(r'\$','dollar',email_contents)
		# word stemming
		#         (complex to implement by hand)
		# remove non-word characters and punctuation
		# 
		email_contents = re.sub(r'[\W]',' ',email_contents)
		# remove single letters
		for i in range(50):
			email_contents = re.sub(r' [a-z] ',' ',email_contents)
		# collapse extra whitespace
		email_contents = re.sub(r'[\t\n ]+',' ',email_contents)
		#

		# from stemmer import PorterStemmer
		# stemmer = PorterStemmer()
		# 
		from stemmer import PorterStemmer
		stemmer = PorterStemmer()
		email_contents = stemmer.stem(email_contents)
		# 
		
		word_list = re.findall(r'\w+',email_contents)
		# return word_list
		for word in word_list:
			if(word_frequency.get(word,"None") == "None"):
				word_frequency[word] = 1;
			else:
				word_frequency[word] += 1;
		# return word_frequency
	except:
		print (file_string + "  ERROR")
		pass
Code example #32
def stemWords(input):
    stem = PorterStemmer()
    for index,entries in enumerate(input):
        input[index]=stem.stem(entries,0,len(entries)-1)
    return input
Code example #33
File: parser.py Project: nelly-hateva/torchwood
total_files = 0
p = PorterStemmer()

if not os.path.exists(corpus_dir):
    os.makedirs(corpus_dir)
os.chdir(data_dir)

for file in glob.glob('*.sgm'):
    current_file = os.path.join(data_dir, file)
    print 'Extract files from file %s' % current_file
    soup = BeautifulSoup(open(current_file))
    for document in soup.find_all('reuters'):
        new_file = os.path.join(corpus_dir, document.get('newid'))
        with open(new_file, "wb") as extracted_file:
            read_data = document.get_text().encode('utf-8')
            clean_data = re.sub(r'/', ' / ', read_data)
            clean_data = re.sub(r'-', ' - ', clean_data)
            """
            The punctuations contained in the string.punctuation are
            !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
            """
            clean_data = clean_data.translate(None, string.punctuation)
            clean_data = clean_data.lower()
            output = ''
            for word in clean_data.split():
                output += p.stem(word, 0, len(word) - 1)
                output += ' '
            extracted_file.write(output)
            total_files += 1
print 'Total files extracted %s' % total_files
Code example #34
def run_train_test(training_file, testing_file):

    # Set the variables, params, dicts, sets
    alpha = 0.5

    stop_words = {'the', 'and'}
    logic_negation = {'t', 'not', 'no', 'never', 'dont', 'didnt', 'doesnt'}
    Porter_Stemmer = PorterStemmer()

    # Import training dataset
    training_start_time = time.time()
    vocab = set(['positive-words', 'negative-words'])
    wordcount_class_0 = {'positive-words': 0, 'negative-words': 0}
    wordcount_class_1 = {'positive-words': 0, 'negative-words': 0}
    total_reviews = 0
    reviewscount_0 = 0
    reviewscount_1 = 0
    train_labels = []
    train_reviews = []
    with training_file as f:
        for line in f:
            review, label = line.split(',')
            words = review.split(' ')
            del words[-1]
            label = int(label.strip("\n"))

            total_reviews += 1

            # Implement negation: add NOT_ to words after logical negation
            for i in range(len(words)):
                words[i] = Porter_Stemmer.stem(words[i])
                if words[i] in logic_negation:
                    try:
                        words[i + 1] = 'NOT_' + words[i + 1]
                    except:
                        continue
                    try:
                        words[i + 2] = 'NOT_' + words[i + 2]
                    except:
                        continue
                    try:
                        words[i + 3] = 'NOT_' + words[i + 3]
                    except:
                        continue

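            # Build bigram features from the negation-tagged, stemmed tokens.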
            bigrams = []
            for i in range(len(words) - 1):
                bigram = words[i] + ' ' + words[i + 1]
                bigrams.append(bigram)

            words = set(bigrams)

            # words = set(words)
            vocab.update(words)

            for word in words:
                if word not in wordcount_class_0.keys():
                    wordcount_class_0[word] = 0
                    wordcount_class_1[word] = 0

            if label == 0:
                reviewscount_0 += 1
                for word in words:
                    wordcount_class_0[word] += 1
                    # # Analyze Sentiment lexicons
                    # unigram1, unigram2 = word.split(' ')
                    # if unigram1 in lexicons.pos_words:
                    #     wordcount_class_0['positive-words'] += 1
                    # if unigram1 in lexicons.neg_words:
                    #     wordcount_class_0['negative-words'] += 1
                    # if unigram2 in lexicons.pos_words:
                    #     wordcount_class_0['positive-words'] += 1
                    # if unigram2 in lexicons.neg_words:
                    #     wordcount_class_0['negative-words'] += 1

            if label == 1:
                reviewscount_1 += 1
                for word in words:
                    wordcount_class_1[word] += 1
                    # # Analyze Sentiment lexicons
                    # unigram1, unigram2 = word.split(' ')
                    # if unigram1 in lexicons.pos_words:
                    #     wordcount_class_0['positive-words'] += 1
                    # if unigram1 in lexicons.neg_words:
                    #     wordcount_class_0['negative-words'] += 1
                    # if unigram2 in lexicons.pos_words:
                    #     wordcount_class_0['positive-words'] += 1
                    # if unigram2 in lexicons.neg_words:
                    #     wordcount_class_0['negative-words'] += 1

            train_labels.append(label)
            train_reviews.append(words)

    # Compute CPTs
    P_class = [0, 0]
    P_class[0] = reviewscount_0 / total_reviews
    P_class[1] = reviewscount_1 / total_reviews

    P_words_class_0 = {}
    P_words_class_1 = {}
    bottom_0 = sum(wordcount_class_0.values()) + alpha * len(vocab)
    bottom_1 = sum(wordcount_class_1.values()) + alpha * len(vocab)
    for word in vocab:
        if word in stop_words:
            P_words_class_0[word] = (0 + alpha) / bottom_0
            P_words_class_1[word] = (0 + alpha) / bottom_1
        else:
            P_words_class_0[word] = (wordcount_class_0[word] +
                                     alpha) / bottom_0
            P_words_class_1[word] = (wordcount_class_1[word] +
                                     alpha) / bottom_1

    # Inference on the training dataset
    predict_train_labels = []
    for doc in train_reviews:
        log_sum_0 = 0
        log_sum_1 = 0
        bag_of_words = set(doc)
        for word in bag_of_words:
            log_sum_0 += log(P_words_class_0[word])
            log_sum_1 += log(P_words_class_1[word])

            # # Sentiment Analysis
            # unigram1, unigram2 = word.split(' ')
            # if unigram1 in lexicons.pos_words:
            #     log_sum_0 += log(P_words_class_0['positive-words'])
            #     log_sum_1 += log(P_words_class_1['positive-words'])
            # if unigram1 in lexicons.neg_words:
            #     log_sum_0 += log(P_words_class_0['negative-words'])
            #     log_sum_1 += log(P_words_class_1['negative-words'])
            # if unigram2 in lexicons.pos_words:
            #     log_sum_0 += log(P_words_class_0['positive-words'])
            #     log_sum_1 += log(P_words_class_1['positive-words'])
            # if unigram2 in lexicons.neg_words:
            #     log_sum_0 += log(P_words_class_0['negative-words'])
            #     log_sum_1 += log(P_words_class_1['negative-words'])

        Prob_c0 = log(P_class[0]) + log_sum_0
        Prob_c1 = log(P_class[1]) + log_sum_1
        if Prob_c0 > Prob_c1:
            c = 0
        else:
            c = 1
        predict_train_labels.append(c)

    # Compute training accuracy
    correct = 0
    for i in range(len(train_labels)):
        if predict_train_labels[i] == train_labels[i]:
            correct += 1
    train_accuracy = correct / len(train_labels)

    training_time = time.time() - training_start_time

    # Import testing dataset
    testing_start_time = time.time()
    test_reviews = []
    test_labels = []
    with testing_file as f:
        for line in f:
            review, label = line.split(',')
            words = review.split(' ')
            del words[-1]
            label = int(label.strip("\n"))

            # Implement negation: add NOT_ to words after logical negation
            for i in range(len(words)):
                words[i] = Porter_Stemmer.stem(words[i])
                if words[i] in logic_negation:
                    try:
                        words[i + 1] = 'NOT_' + words[i + 1]
                    except:
                        continue
                    try:
                        words[i + 2] = 'NOT_' + words[i + 2]
                    except:
                        continue
                    try:
                        words[i + 3] = 'NOT_' + words[i + 3]
                    except:
                        continue

            bigrams = []
            for i in range(len(words) - 1):
                bigram = words[i] + ' ' + words[i + 1]
                bigrams.append(bigram)

            words = set(bigrams)
            # words = set(words)

            test_labels.append(label)
            test_reviews.append(words)

    # Inference on the testing dataset
    predict_test_labels = []
    for doc in test_reviews:
        log_sum_0 = 0
        log_sum_1 = 0
        bag_of_words = set(doc)
        bag_of_words = vocab.intersection(bag_of_words)
        for word in bag_of_words:
            log_sum_0 += log(P_words_class_0[word])
            log_sum_1 += log(P_words_class_1[word])
            # # Sentiment Analysis
            # unigram1, unigram2 = word.split(' ')
            # if unigram1 in lexicons.pos_words:
            #     log_sum_0 += log(P_words_class_0['positive-words'])
            #     log_sum_1 += log(P_words_class_1['positive-words'])
            # if unigram1 in lexicons.neg_words:
            #     log_sum_0 += log(P_words_class_0['negative-words'])
            #     log_sum_1 += log(P_words_class_1['negative-words'])
            # if unigram2 in lexicons.pos_words:
            #     log_sum_0 += log(P_words_class_0['positive-words'])
            #     log_sum_1 += log(P_words_class_1['positive-words'])
            # if unigram2 in lexicons.neg_words:
            #     log_sum_0 += log(P_words_class_0['negative-words'])
            #     log_sum_1 += log(P_words_class_1['negative-words'])
        Prob_c0 = log(P_class[0]) + log_sum_0
        Prob_c1 = log(P_class[1]) + log_sum_1
        if Prob_c0 > Prob_c1:
            c = 0
        else:
            c = 1
        # print(c)
        predict_test_labels.append(c)

    # Compute testing accuracy
    correct = 0
    for i in range(len(test_labels)):
        if predict_test_labels[i] == test_labels[i]:
            correct += 1
    test_accuracy = correct / len(test_labels)

    # Print results
    testing_time = time.time() - testing_start_time
    print(round(training_time), 'seconds (training)')
    print(round(testing_time), 'seconds (labeling)')
    print(round(train_accuracy, 3), '(training)')
    print(round(test_accuracy, 3), '(testing)')
    print(len(vocab))

    return
Code example #35
File: githater.py Project: kristopolous/githater
  'file' : {
    'ls-time': 1
  },
  'ls' : {
    'ls-time': 1,
    'tag-time': 1
  },
  'refspec': {
    'new-branch-push': 2
  }
}

computed = {}
stemmer = PorterStemmer()
for word in sys.argv[1:]:
  word = stemmer.stem(word.lower())
  print word
  if word in synonymMap:
    word = synonymMap[word]

  if word in weightMap:
    for key, value in weightMap[word].iteritems():
      if key in computed:
        computed[key] += value
      else:
        computed[key] = value

sorted_computed = sorted(computed.iteritems(), key=operator.itemgetter(1))

sorted_computed.reverse()
Code example #36
File: functions.py Project: arnov/air2010
def stem_word(sentence):
    for i in range(len(sentence)):
        p = PorterStemmer()
        #sentence[i] = sentence[i].lower()
        sentence[i] = p.stem(sentence[i], 0, len(sentence[i]) - 1)
    return sentence
Code example #37
File: neural_net.py Project: KevinKhieu/cs229Project
def GetFeatures(emails):

    #np.random.shuffle(emails)
    word_mapping = GetWordDictionary(emails)

    # with open("balanced_chains_shuffed.pickle", "wb") as fp2:
    # 	pickle.dump(emails, fp2, protocol=pickle.HIGHEST_PROTOCOL)
    stemmer = PorterStemmer()
    training_cutoff = int(len(emails) * 0.9)
    x_vals = []
    y_vals = []
    count_0 = 0
    count_1 = 0
    for i in range(0, len(emails)):
        #print "Evaluation Email %d" % (i)
        email, next_email, time_diff, bucket = emails[i]  #, bucket

        if (float(np.round(time_diff / 60)) > 24):
            continue
        num_features = 9  #15 #13
        num_words = 0  #len(word_mapping)
        # Create feature array
        features = np.zeros(shape=(num_features + num_words))

        #Feature 1: Number of to
        features[0] = float(len(email['to']))

        # Feature 2: Num words
        words = email['body'].split()
        lower_case_body = [
            stemmer.stem(x.lower(), 0,
                         len(x) - 1) for x in words
        ]
        features[1] = float(len(words))

        # print email
        # print bucket
        # print lower_case_body
        # print "-------------\n\n"

        #Feature 3: Number of CC
        # Old: 0.5442, 0.5387
        features[2] = float(email['cc_count'])

        # Feature 4: is reply
        # without, .6298, with. .5994
        if email['is_re']:
            features[3] = 1.0
        else:
            features[3] = 0.0

        # Feature 5: Time of Day (hour)
        #print email
        date = email['date']['local_date']
        hour = date.hour
        # Old, 0.5442, New = 0.5276
        features[4] = float(hour)

        #Feature 6: Length of Subject Line
        subject_words = email['subject'].split()
        lower_case_words = [
            stemmer.stem(x.lower(), 0,
                         len(x) - 1) for x in subject_words
        ]
        features[5] = int(len(subject_words))

        # Feature 7: Day of Week
        # without, 0.5580
        features[6] = (date.weekday())

        # Feature 8: Question marks in Body
        features[7] = (email['body'].count('?'))

        # Feature 9: Question marks in Subject
        features[8] = (email['subject'].count('?'))

        # # Feature 10: "RESPONSE NEEDED" in subject
        # stemmed_response = stemmer.stem("response", 0, len("response") - 1)
        # if stemmed_response in lower_case_words:
        # 	features[9] = 1
        # else:
        # 	features[9] = 0

        # # Feature 11: "Please" in words
        # stemmed_please = stemmer.stem("please", 0, len("please") - 1)
        # if stemmed_please in lower_case_words:
        # 	features[10] = 1
        # else:
        # 	features[10] = 0

        # if email['body'].find('?') != -1:
        # 	features[11] = 1
        # else:
        # 	features[11] = 0

        # if email['subject'].find('?') != -1:
        # 	features[12] = 1
        # else:
        # 	features[12] = 0

        # stemmed_can = stemmer.stem("can", 0, len("can") - 1)
        # if stemmed_can in lower_case_words:
        # 	features[13] = 1
        # else:
        # 	features[13] = 0

        # stemmed_important = stemmer.stem("important", 0, len("important") - 1)
        # if stemmed_important in lower_case_words:
        # 	features[14] = 1
        # else:
        # 	features[14] = 0

        # if stemmed_important in lower_case_body:
        # 	features[15] = 1
        # else:
        # 	features[15] = 0

        # stemmed_urgent = stemmer.stem("urgent", 0, len("urgent") - 1)
        # if stemmed_urgent in lower_case_words:
        # 	features[16] = 1
        # else:
        # 	features[16] = 0

        # if stemmed_urgent in lower_case_body:
        # 	features[17] = 1
        # else:
        # 	features[17] = 0

        # stemmed_need = stemmer.stem("need", 0, len("need") - 1)
        # if stemmed_need in lower_case_words:
        # 	features[18] = 1
        # else:
        # 	features[18] = 0

        # if stemmed_need in lower_case_body:
        # 	features[19] = 1
        # else:
        # 	features[19] = 0

        # body = SplitText(email['body'])
        # for word in body:
        # 	modified_word = word
        # 	if len(modified_word) > 1:
        # 		modified_word = stemmer.stem(word, 0, len(word) - 1).lower()

        # 	if modified_word in word_mapping:
        # 		features[word_mapping[modified_word]] = 1

        x_vals.append(features)
        y_vals.append(bucket)
        #y_vals.append(float(np.round(time_diff / 60)))
        #Append y_value for training point
        # if float(time_diff) > 60.0:
        # 	y_vals.append(1)
        # 	count_1 += 1
        # else:
        # 	y_vals.append(0)
        # 	count_0 += 1
        #y_vals.append(float(np.round(time_diff / 60)))

    #print count_0
    #print count_1
    return x_vals, y_vals
Code example #38
class Parse:
    def __init__(self):
        self.stop_words = stopwords.words('english')
        self.dictionary_term_index = {}
        self.array_names_and_entities = {}
        self.porter_stemmer = PorterStemmer()

    def parse_sentence(self, text, stemmer=False):
        """
        This function tokenizes, removes stop words and lowercases every word within the text
        :param text:
        :return:
        """
        list_percent = ["percent", "Percent", "Percentage", "percentage"]
        self.array_names_and_entities = {}
        self.dictionary_index = {}
        text = text.replace("\n", ". ")
        text = self.ignore_emojis(text)
        array_text_space = text.split(" ")
        array_text_space = self.separate_words_with_dots(array_text_space)
        string_ans = ""
        array_size = range(len(array_text_space))
        string_ans_index = 0
        entities_url = []  # help us to replace the url to "" because in get_entities it returns parts of the url
        for word, idx in zip(array_text_space, array_size):
            ans = ""
            if word == '' or word == ' ': continue
            check_digit = self.isdigit(word)
            if len(word) < 2 and check_digit is False: continue
            if len(word) < 2 or self.is_ascii(word) is False:
                if check_digit is False:
                    word = self.remove_panctuation(word)
                    if self.is_ascii(word) is False or word == '' or word == " " or len(
                            word) < 2 or word.lower() not in self.stop_words:
                        continue
            if ans == "" and self.is_url(word):
                entities_url.append(word)
                if "t.co" in word: continue
                ans = self.parse_url(word)
                if ans == "":
                    entities_url.remove(word)
                    continue
            else:
                if ans == "" and len(word) < 2 and word[0] != '#' and self.is_ascii(word) and not self.isfloat(word):
                    word = self.remove_panctuation(word)
            if ans == "" and word[0] == '#':
                temp_word = self.remove_panctuation(word)
                if temp_word == "" or temp_word == "#":
                    continue
                ans = self.parse_hashtag(temp_word)
            elif ans == "" and word[0] == '@':
                ans = self.remove_panctuation(word)
            elif ans == "" and word in list_percent:
                if idx > 0 and self.isfloat(array_text_space[idx - 1]):
                    ans = self.parse_percentage(array_text_space[idx - 1] + " " + word)
                    string_ans = string_ans[:len(string_ans) - 1 - len(ans)] + string_ans[
                                                                               len(string_ans) + len(word):] + " "
                else:
                    ans = word
            elif ans == "" and (word.lstrip('-').isdigit() or self.isfloat(word.lstrip('-')) or self.isFraction(
                    word.lstrip('-')) or word.replace('~', '').isdigit()):
                ans = self.convert_str_to_number(array_text_space, idx)
            if ans == "":
                pre_ans = self.remove_panctuation(word)
                if len(pre_ans) < 2: continue
                array_ans = pre_ans.split()
                for word_array in array_ans:
                    if word_array.lower() in self.stop_words: continue
                    string_ans += self.add_to_dictionary(word_array.lower(), string_ans_index)
                    string_ans_index += len(word) + 1
            else:
                string_ans += self.add_to_dictionary(ans, string_ans_index)
                string_ans_index += len(word) + 1

        self.get_name_and_entities(entities_url, array_text_space)
        array_parsed = string_ans.split()
        return array_parsed, self.array_names_and_entities

    def separate_words_with_dots(self, array_text):
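        """Split a token that glues two sentences together with a period, leaving URLs and decimal numbers intact."""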
        new_text = ""
        length = range(len(array_text))
        for i in length:
            word = array_text[i]
            if '.' not in word:
                new_text += word + " "
                continue
            if "http" in word or "www" in word or "t.co" in word or self.isfloat(word):
                new_text += word + " "
                continue

            separate = str(word).split('.')
            new_text += separate[0] + ". " + separate[1] + " "
        return new_text.lstrip().split(" ")

    def is_url(self, text):
        '''
        check if string is a url path
        :param text: url
        :return: boolean
        '''
        regex = re.compile(
            r'^(?:http|ftp)s?://|(?:www)?.'  # http:// or https://
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
            r'localhost|'  # localhost...
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
            r'(?::\d+)?'  # optional port
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)

        return re.match(regex, text) is not None
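        # Hedged example: is_url("https://example.com/page") is expected to return True here.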

    def add_to_dictionary(self, text, index):
        array_of_words = text.split(" ")
        ans = ""
        for word in array_of_words:
            ans += word + " "
            self.dictionary_index[word] = index
        if ans == "": return ""
        return ans

    def parse_hashtag(self, phrase):
        """"
        parser hash tag and lower the letters
        return array of string
        #stayAtHome -> ['#stayathome',stay,at,home]
        """
        original_phrase = phrase
        pattern = re.compile(r"[A-Z][a-z]+|\d+|[A-Z]+(?![a-z])")
        if phrase[1].islower() and '_' not in original_phrase:
            phrase = phrase[:1] + phrase[1].upper() + phrase[2:]
        temp = pattern.findall(phrase)
        all_words = phrase[1:].split("_")
        for word in all_words:
            if word != phrase[1:] and word.lower() and word not in temp: temp.append(word)
        temp = [str_to_lower.lower() for str_to_lower in temp]
        temp.insert(0, original_phrase[0:len(original_phrase)].lower().replace('_', ''))
        temp = ['' if t in self.stop_words or len(t) < 2 else t for t in temp]
        return " ".join(temp).strip()

    def parse_url(self, string):
        """
        parsing url path
        return an array of the components
        """
        if string is not None:
            ans = string.split("/")
            ans_len = len(ans)
            remove_www = ""
            if ans_len > 0:
                for term in ans:
                    remove_www += term.replace("www.", "") + " "
                ans[0] = remove_www
                string_without_stopword = ""
                length = range(len(ans))
                ans_string = ans[0].split(" ")
                for word, idx in zip(ans_string, length):
                    if word == '' or word == ' ': continue
                    if len(word) < 2 or (len(word) > 0 and word[0] == '#'): continue
                    if word not in self.stop_words or word.isnumeric():
                        if not self.is_url(word):
                            word = self.remove_panctuation(word)
                        string_without_stopword += word + " "
                return string_without_stopword.lstrip()
            else:
                return ""

    def isdigit(self, word):
        if "0" <= word <= "9":
            return True
        return False

    def isfloat(self, value):
        """
            check if value is a float number
        :return: boolean
        """
        try:
            float(value)
            return True
        except ValueError:
            return False

    def isFraction(self, token):
        """
        check if value is a fraction number
        :return: boolean
        """
        if '/' not in token:
            return False
        values = token.split('/')
        return all(i.isdigit() for i in values)
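        # e.g. isFraction("23/5") -> True, isFraction("2.5/3") -> False (non-integer parts)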

    def convert_str_to_number_kmb(self, word):
        """
                check if value is a float number, and return the wanted number. etc: 1000->1K, 1013456->1.013M
                :return: boolean
                """
        tmb = ''
        if word >= 1000000000 or word <= -1000000000:
            word = float(word / 1000000000)
            tmb = 'B'
        elif word >= 1000000 or word <= -1000000:
            word = float(word / 1000000)
            tmb = 'M'
        elif word >= 1000 or word <= -1000:
            word = float(word / 1000)
            tmb = 'K'
        ans = '{:0.3f}'.format(word)
        return '{0:g}'.format(float(ans)) + tmb
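        # e.g. convert_str_to_number_kmb(1000) -> "1K", convert_str_to_number_kmb(1013456) -> "1.013M"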

    def convert_str_to_number(self, text_demo, idx):
        """
        check every type of number and return it as a string. etc: 1K,1M,1B,-900,23/5,2020,2K
        :return: boolean
        """
        help_minus = ''
        text_return = []
        my_word = text_demo[idx]
        text_demo_length = len(text_demo)
        my_word = my_word.replace(",", "")
        if re.search('-', my_word):
            help_minus = '-'
            my_word = my_word.replace("-", "")
        if not self.isfloat(my_word): my_word = self.remove_panctuation(my_word)
        if self.isFraction(my_word):
            if idx + 1 == text_demo_length:
                return ''.join(help_minus + my_word)
            text_return = ''.join(help_minus + my_word)
            token_next = text_demo[idx + 1].lower()
            if token_next == "billion" or token_next == "billions":
                text_return += 'B'
                text_demo[idx + 1] = ""
            if token_next == "million" or token_next == "millions":
                text_return += 'M'
                text_demo[idx + 1] = ""
            if token_next == "thousand" or token_next == "thousands":
                text_return += 'K'
                text_demo[idx + 1] = ""
            return help_minus + ''.join(text_return)
        if my_word != '' and not math.isnan(float(my_word)):
            number = float(my_word)
            number_numerize = self.convert_str_to_number_kmb(number)
            if idx + 1 < len(text_demo):
                token_next = text_demo[idx + 1].lower()
                number_to_input = str(number_numerize)
                if token_next == "billion" or token_next == "billions":
                    if 'K' in number_numerize or 'M' in number_numerize:
                        number_to_input = (number_to_input.translate({ord('K'): None}))
                        number_to_input = (number_to_input.translate({ord('M'): None}))
                        text_return.append(my_word)
                    else:
                        text_return.append(str(number_numerize + 'B'))
                    text_demo[idx + 1] = ""

                elif token_next == "million" or token_next == "millions":
                    if 'K' in number_numerize:
                        number_to_input = (number_to_input.translate({ord('K'): None}))
                        text_return.append(number_to_input + 'B')
                    else:
                        number_to_input = str(number_numerize)
                        text_return.append(number_to_input + 'M')
                    text_demo[idx + 1] = ""
                elif token_next == "thousand" or token_next == "thousands":
                    if 'K' in number_numerize:
                        number_to_input = (number_to_input.translate({ord('K'): None}))
                        text_return.append(number_to_input + 'M')
                    elif 'M' in number_numerize:
                        number_to_input = (number_to_input.translate({ord('M'): None}))
                        text_return.append(number_to_input + 'B')
                    else:
                        text_return.append(number_to_input + 'K')
                    text_demo[idx + 1] = ""
                else:
                    text_return.append(number_numerize)
            else:
                text_return.append(number_numerize)
            if 1900 < number < 2100 and help_minus == '':
                if '~' in text_demo[idx]:
                    text_return.append(my_word)
                else:
                    text_return.append(text_demo[idx])
        return help_minus + ' '.join(text_return)

    def ignore_emojis(self, text):
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002500-\U00002BEF"  # chinese char
                                   u"\U00002702-\U000027B0"
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   u"\U0001f926-\U0001f937"
                                   u"\U00010000-\U0010ffff"
                                   u"\u2640-\u2642"
                                   u"\u2600-\u2B55"
                                   u"\u200d"
                                   u"\u23cf"
                                   u"\u23e9"
                                   u"\u231a"
                                   u"\ufe0f"  # dingbats
                                   u"\u3030"
                                   "]+", flags=re.UNICODE)
        ans = emoji_pattern.sub(r'', text)
        return ans

    def is_ascii(self, s):
        ans = all(ord(c) < 128 or c == '…' or c == '’' or c == '³' or c == "¹⁹" for c in s)
        return ans

    def parse_percentage(self, string):
        """
        change word to percent
        100 percent -> 100%
        :param string: string to check if there is a percent within
        :return: array of converted strings
        """
        return re.split('\s+', string)[0] + '%'
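        # e.g. parse_percentage("100 percent") -> "100%"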

    def remove_panctuation(self, word):
        """
                remove pancuations from word (like . or , or : )
                :param word
                :return: word without panctuation
                """
        # chars = set('.,:;!()[]{}?=+…$&')
        if re.match(r'[^@]+@[^@]+\.[^@]+', word): return word
        if "#" == word or "##" == word: return ""
        if word[-2:] == "'s" or word[-2:] == "’s" or word[-2:] == "`s": word = word.replace(word[-2:], "")
        smiles = [":)", ":(", ":-]", ":-)", ";)", ";-)", ":-(", ";(", ";-(", ":-P", ":P", ":p", ":-p"]
        for smile in smiles:
            if smile in word: word = word.replace(smile, "")
        if word in smiles: return ''
        if "\n" in word: word = word.replace("\n", " ")
        if '#' in word and word[0] != '#': word = word.replace("#", "")
        if '_' in word and '#' not in word:
            word = word.replace("_", "")
        if '@' in word and word[0] != '@': word = word.replace("@", "")

        word = word.replace("-", " ")
        word = word.replace("'", "")
        word = re.sub(r'[€£€4️⃣“”‘⁦⁩‼⑥²⁸¹❶❷❽②⑦&$~’.,!…|?,…:;^"{}*=+()⁰\/[\[\]]', '', word)
        return word

    def get_name_and_entities(self, entities_url, array_text_space):
        text = ""
        for word in array_text_space:
            if word == '' or word[0] == '@' or word[0] == '#' or word == "RT": continue
            text += word + " "

        rx2 = re.compile(r'[A-Z][-a-zA-Z]+[1-9]*(?:\s+[A-Z][-a-zA-Z]+[1-9]*)*')
        matches = rx2.findall(text)
        tokinzed_entity_new = set()
        for match in matches:
            if len(str(match).split()) > 1:
                tokinzed_entity_new.add(str(match))
        if "COVID 19" in text: tokinzed_entity_new.add("COVID 19")
        if "Covid 19" in text: tokinzed_entity_new.add("Covid 19")

        for word in tokinzed_entity_new:
            if word.lower() not in self.stop_words:
                all_places = [m.start() for m in re.finditer(word, text)]
                self.array_names_and_entities[word] = all_places
        return tokinzed_entity_new

    def parse_doc(self, doc_as_list,stemmer=False):
        """
        This function takes a tweet document as list and break it into different fields
        :param doc_as_list: list re-preseting the tweet.
        :return: Document object with corresponding fields.
        """
        tweet_id = doc_as_list[0]
        tweet_date = doc_as_list[1]
        full_text = doc_as_list[2]
        url = doc_as_list[3]
        indices = doc_as_list[4]
        retweet_text = doc_as_list[5]
        retweet_url = doc_as_list[6]
        retweet_indices = doc_as_list[7]
        quote_text = doc_as_list[8]
        quote_url = doc_as_list[9]
        quote_indices = doc_as_list[10]
        term_dict = {}
        entities_local_dict = {}
        array_url_parsed = []
        url = str(url)
        rt = False
        if "RT" in full_text:
            rt = True
        if url != "{}" and "null" not in url:
            dict2 = eval(url)
            keys = dict2.keys()
            for key in keys:
                if dict2[key] != str("null") and "t.co" not in dict2[key]:
                    url_parsed = self.parse_url(dict2[key])
                    check = url_parsed.split()
                    for word in check:
                        array_url_parsed.append(word)

        tokenized_text, names_and_entities = self.parse_sentence(full_text, stemmer=False)
        doc_length = len(tokenized_text)  # after text operations.
        if doc_length == 0:
            return None

        for term in tokenized_text:
            if len(term) < 2: continue
            if stemmer:
                term = self.porter_stemmer.stem(term)
            if term not in term_dict.keys():
                term_dict[term] = 1
            else:
                term_dict[term] += 1
        for term in array_url_parsed:
            if len(term) < 2: continue
            if stemmer:
                term = self.porter_stemmer.stem(term)
            if term.lower() in self.stop_words or term == 'http' or term == 'https' or term == 'www':
                continue
            if term not in term_dict.keys():
                term_dict[term] = 1
            else:
                term_dict[term] += 1
        for term in names_and_entities.keys():
            if len(term) < 2: continue
            if term in self.stop_words:
                continue
            if term not in term_dict.keys():
                term_dict[term] = 1
            else:
                term_dict[term] += 1

        document = Document(tweet_id, tweet_date, full_text, url, retweet_text, retweet_url, quote_text,
                            quote_url, term_dict, len(self.array_names_and_entities), rt, doc_length)
        return document
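
For orientation, here is a minimal, hedged usage sketch of parse_doc above. The enclosing class name (Parse), its no-argument constructor, and the tweet row are assumptions invented for illustration, not part of the original project:

parser = Parse()   # hypothetical class name for the parser shown above
row = [
    "1316000000000000000",                                    # tweet_id
    "Thu Oct 15 2020",                                        # tweet_date
    "RT 5 million people #StayAtHome https://www.example.com/covid",  # full_text
    "{}",                                                     # url field (stringified empty dict)
    None, None, None, None, None, None, None,                 # indices / retweet / quote fields
]
doc = parser.parse_doc(row, stemmer=False)
if doc is not None:
    print(doc)   # Document object built from the tweet's term dictionary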
Code example #39
0
File: tools.py Project: davedash/mealadvisor
def stem(word):
    p = PorterStemmer()

    return p.stem(word, 0, len(word)-1)
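
A quick usage sketch of the stem helper above (hedged; it only relies on the same PorterStemmer import used throughout these examples):

print(stem("running"))    # -> "run"
print(stem("stemming"))   # -> "stem"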
Code example #40
0
File: help.py Project: B-Rich/ibid-1
 def stemWord(self, word):
     return PorterStemmer.stem(self, word, 0, len(word) - 1)
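
The one-liner above reads like a method of a class that inherits from PorterStemmer; a self-contained sketch of such a wrapper (the class name StemmingHelper is hypothetical):

from stemmer import PorterStemmer

class StemmingHelper(PorterStemmer):
    def stemWord(self, word):
        return PorterStemmer.stem(self, word, 0, len(word) - 1)

print(StemmingHelper().stemWord("indexing"))   # -> "index"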
Code example #41
0
File: RawFreq.py Project: yipeiw/twitter
    fout=open(boffile, 'w')
    
    wordDict=defaultdict(int)
    wordDict.clear()

    #go through all tweets and count for the number of each term
    while True:
        line = fin.readline()
        if not line:
            break

        text = Tparse.GetText(line.strip())
        tokens = re.split('\s+|_', text)
        for token in tokens:
            word = re.sub("[^a-z]", "", token.lower())
            word = stemmer.stem(word)
            if len(word)>0:
                wordDict[word] += 1
   
    #calculate frequency and output result
    total = sum(wordDict.values())
    output_vector = [(index, float(val)/float(total)) for index, val in wordDict.items()]
    output_vector = sorted(output_vector, key=lambda item: item[0])
    
    for index, value in output_vector:
        fout.write("%s %s\n" % (index, value))

    fout.close()
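
The final loop above only turns raw counts into relative frequencies; a small self-contained sketch of that normalization step with toy data (no file I/O):

from collections import defaultdict

counts = defaultdict(int)
for token in ["covid", "vaccine", "covid", "mask"]:
    counts[token] += 1
total = float(sum(counts.values()))
frequencies = sorted((word, count / total) for word, count in counts.items())
print(frequencies)   # [('covid', 0.5), ('mask', 0.25), ('vaccine', 0.25)]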
    
Code example #42
0
class pre_worker(object):

    r_stopwords = None    # stop-word list
    c_stopwords_lock = threading.Lock()

    def __init__(self,name,task_queue,result_queue):
        self.name = name
        self.r_stemmer = PorterStemmer()
        self.queue = task_queue
        self.queue2 = result_queue
        # load the stop-word list
        pre_worker.load_stopwords()

    def work(self):
        
        process_print(u'%s started working' % (self.name,))

        self.start_time = time.clock()
        count = 0
        while 1:
            content_list = self.queue.get()  # take one batch of document contents from the task queue
            if content_list is None:
                self.queue2.put(None)
                break
            count +=len(content_list)
            for content in content_list:
                result_list = self.__parse_doc(content)
                self.queue2.put(result_list)    # put the result into the output queue
        process_print(u' %s has exited, processed %d tasks in total' % (self.name, count))

    @staticmethod
    def load_stopwords():
        if pre_worker.r_stopwords == None:
            pre_worker.c_stopwords_lock.acquire()
            try:
                f = open('stopwords.txt')
                content = f.readline().split(',')
                f.close()
                pre_worker.r_stopwords = set(content)
                #print u'stop-word list loaded successfully', len(content)
            finally:
                pre_worker.c_stopwords_lock.release()

    def __parse_doc(self,content):
        items = re.findall(setting.c_reg_doc,content)
        result_list = []
        for item in items:
            # get the doc's name
            result = re.search(setting.c_reg_doc_name,item)  
            if result == None:
                continue
            name = result.group(1).strip()
            # get the doc's content
            result = re.search(setting.c_reg_doc_text,item)
            if result == None:
                continue
            text_content = result.group(1)
            term_list = self.__parse_text(text_content)
            if len(term_list) == 0:
                continue
            result_list.append((name,' '.join(term_list)))
        return result_list

    def __parse_text(self,text_content):
        # walk the document body and split it into terms
        begin = 0
        term_list = []
        for index in range(0,len(text_content)):
            if text_content[index] not in setting.c_valid_chars:
                # the token length must be at least setting.c_min_length
                if index - begin < setting.c_min_length:
                    begin = index + 1
                    continue
                content = text_content[begin:index].lower()
                # get the word stem
                content = self.r_stemmer.stem(content,0,len(content) - 1)
                begin = index + 1
                # if the stem is still too short, discard it as well
                if len(content) < setting.c_min_length:
                    continue
                # drop stop words
                if content in pre_worker.r_stopwords:
                    continue
                term_list.append(content)
            
        return term_list

    def parse_topic(self,filename):
        content = get_all_text(filename)
        items = re.findall(setting.c_reg_topic,content)
        result_list = []
        for item in items:
            result = re.search(setting.c_reg_topic_id,item)
            if result == None:
                continue
            id = result.group(1)
            index = item.index('<title>')
            text_content = re.sub(setting.c_reg_sub,' ',item[index:])
            term_list = self.__parse_text(text_content)
            if len(term_list) == 0:
                continue
            result_list.append((id,' '.join(term_list)))
        return result_list
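
A hedged sketch of how a pre_worker might be driven, following the queue protocol visible in work(): each task is a list of document strings, and None is the shutdown sentinel. The setting module, stopwords.txt and process_print are assumed to be available from the surrounding project, and the document string below is invented, so this is illustrative rather than runnable in isolation:

import threading
from Queue import Queue   # the snippet is Python 2 style (u'' literals)

tasks, results = Queue(), Queue()
worker = pre_worker(u'worker-1', tasks, results)
thread = threading.Thread(target=worker.work)
thread.start()
tasks.put(["<DOC><DOCNO> d1 </DOCNO><TEXT> stemming stemmed stems </TEXT></DOC>"])   # invented format
tasks.put(None)            # sentinel: tells the worker to finish
thread.join()
while not results.empty():
    print(results.get())   # parsed (name, terms) lists, then the worker's own None sentinel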
Code example #43
0
def GetDataset():
    np.random.shuffle(emails)

    x_vals = []
    y_vals = []

    stemmer = PorterStemmer()
    word_mapping = GetWordDictionary(emails)

    i = 0
    text_data = []
    for i in range(0, len(emails)):
        #print "Evaluation Email %d" % (i)
        # note: time diff in mins
        email, next_email, time_diff, label = emails[i]

        # Create feature array
        features = []

        #Feature 1: Number of recipients in the "to" field
        features.append(len(email['to']))

        # Feature 2: Num words
        words = email['body'].split()
        lower_case_body = [
            stemmer.stem(x.lower(), 0,
                         len(x) - 1) for x in words
        ]
        features.append(len(words))

        # Feature 3: Number of CC
        features.append(email['cc_count'])

        # Feature 4: is reply
        if email['is_re']:
            features.append(1)
        else:
            features.append(0)

        # Feature 5: Time of Day (hour)
        date = email['date']['local_date']
        hour = date.hour
        features.append(hour)

        # Feature 6: Length of Subject Line
        subject_words = email['subject'].split()
        lower_case_subject = [
            stemmer.stem(x.lower(), 0,
                         len(x) - 1) for x in subject_words
        ]
        features.append(len(subject_words))

        # Feature 7: Day of Week
        features.append(date.weekday())

        # Feature 8: Number of question marks in Body
        features.append(email['body'].count('?'))

        # Feature 9: Number of question marks in Subject
        features.append(email['subject'].count('?'))

        # NEW FEATURES

        # Features 10-11: boolean presence of '?' in body / subject
        features.append(1 if '?' in email['body'] else 0)
        features.append(1 if '?' in email['subject'] else 0)

        # Features 12-23: stemmed "response needed"-style keywords in subject / body
        keywords = ['response', 'please', 'can', 'urgent', 'important', 'need']
        for keyword in keywords:
            stemmed_keyword = stemmer.stem(keyword, 0, len(keyword) - 1)
            features.append(1 if stemmed_keyword in lower_case_subject else 0)
            features.append(1 if stemmed_keyword in lower_case_body else 0)

        x_vals.append(features)
        y_vals.append(label)

    X = np.array(x_vals)
    Y = np.array(y_vals)
    return X, Y
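
A minimal consumption sketch, assuming the module-level emails list used above has already been loaded elsewhere in the script:

X, Y = GetDataset()
print(X.shape, Y.shape)   # one row of hand-crafted features per email, one label per email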
Code example #44
0
File: search.py Project: nelly-hateva/torchwood
        for document, count in pairwise(documents):
            dictionary[document] = int(count)
        count_term.update({entry[0]: int(entry[1])})
        index.update({entry[0]: dictionary})

with open(lengths_file, "r") as lengths:
    documents_lengths = map(lambda line: line.strip(), lengths.readlines())
    documents_lengths = [int(length) for length in documents_lengths]
    corpus_length = documents_lengths[len(documents_lengths) - 1]
    del documents_lengths[len(documents_lengths) - 1]


for term in query:
    term = term.translate(None, string.punctuation)
    if term not in stopwords:
        inner_query.append(p.stem(term.lower(), 0, len(term) - 1))


documents_count = len(documents_lengths)
query_terms_count = len(inner_query)
heap = []
probabilities = [[1 for x in xrange(documents_count)] for x in xrange(query_terms_count)]
results = [1 for x in xrange(documents_count)]
for i, term in enumerate(inner_query):
    try:
        term_collection_frequency = count_term[term] / corpus_length
    except KeyError:  # the term doesn't appear in the collection
        continue
    for document in xrange(1, documents_count + 1):
        probabilities[i][document - 1] = lambda_ * (index[term][str(document)] / documents_lengths[document - 1] + term_collection_frequency)