Example No. 1
def run_search(dict_file, postings_file, queries_file, results_file):
    """
    using the given dictionary file and postings file,
    perform searching on the given queries file and output the results to a file
    """
    print('running search on the queries...')

    dictionary = dict()
    lengths = dict()
    stemmer = stem.PorterStemmer()

    #Open dictionary in memory
    with open(dict_file, "rb") as dictionary_f:
        dictionary = pickle.load(dictionary_f)

    with open(os.path.join(os.getcwd(), "lengths.txt"), "rb") as lengths_f:
        lengths = pickle.load(lengths_f)

    #Open and read each line of the queries file
    try:
        fd = open(queries_file, 'r', encoding="utf8")
        line = fd.readline()
    except OSError:
        error_opening_file(queries_file)
        sys.exit(2)

    #Open the results file for writing (truncating any previous contents)
    output_file = open(results_file, "w")

    #Evaluate each line or query
    while line:

        #If the line is blank, just write an empty line
        if line.strip() == "":
            output_file.write('\n')
            line = fd.readline()
            continue

        #Tokenize the query
        tokens = nltk.word_tokenize(line)
        scores = defaultdict(float)

        #COSINE SCORE
        #For each query term t
        stemmed_tokens = list()
        unique_tokens = set()
        for token in tokens:
            stemmed_tokens.append(stemmer.stem(token.lower()))
            unique_tokens.add(stemmer.stem(token.lower()))

        for token in unique_tokens:

            docFreq_pointer = dictionary.get(token, -1)

            if (docFreq_pointer == -1):
                continue

            # get the document_frequency for the token
            document_frequency = docFreq_pointer[0]

            #Read this token's postings list: seek to the stored offset and
            #unpickle only the object stored at that position
            with open(postings_file, "rb") as postings_f:
                postings_f.seek(docFreq_pointer[1])
                token_postings_list = pickle.load(postings_f)

            for docID_termF in token_postings_list:
                doc_vector = lengths[docID_termF[0]]
                query_idf = (len(lengths) + 1) / (document_frequency + 1)
                """
                print("current token of query:")
                print(token)

                print("term frequency in query:")
                print(stemmed_tokens.count(token))

                print("weight of term in doc vector:")
                print(doc_vector[token])

                print("tf of term in query:")
                print(1 + math.log(stemmed_tokens.count(token), 10))

                print("idf division:")
                print(query_idf)

                print("idf of term in query:")
                print(math.log((query_idf),10))

                print("weight of the term in query:")
                print(((1 + math.log(stemmed_tokens.count(token), 10)) * math.log((query_idf),10)))
                """

                #Accumulate this document's cosine contribution:
                #document weight * query term weight (log tf * log idf)
                scores[docID_termF[0]] += doc_vector[token] * (
                    (1 + math.log(stemmed_tokens.count(token), 10)) *
                    math.log(query_idf, 10))

        #Take the ten documents with the highest final scores; negating the
        #score makes heapq's nsmallest behave like a max-heap selection and
        #breaks ties by the smaller docID (nsmallest is imported from heapq)
        maxTen = nsmallest(10, [(-score, docID) for docID, score in scores.items()])
        result = [docID for _, docID in maxTen]
        #Write the result with the specified format
        output_file.write(' '.join(map(str, result)))
        #Prepare new line
        output_file.write("\n")

        line = fd.readline()

    output_file.close()
    fd.close()
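A minimal usage sketch (not part of the original example; the file names are placeholders for index files produced by a matching indexing step):

# Hypothetical invocation; the file names are placeholders, and the function
# also expects a pickled lengths.txt in the current working directory.
if __name__ == "__main__":
    run_search("dictionary.txt", "postings.txt", "queries.txt", "results.txt")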
Example No. 2
def stem_word(word):
    #stemming of words
    new_word = word.replace(",", "")
    new_word = new_word.replace("\'", "")
    stemmer = stem.PorterStemmer()
    return stemmer.stem(new_word)
Example No. 3
 def __init__(self):
     self.stemmer = stem.PorterStemmer()
     self.validreg = re.compile(r'^[-=!@#$%^&*()_+|;";,.<>/?]+$')
     self.splitreg = re.compile(r'\s|,|\.|\(|\)|\'|/|\'|\[|\]|-')
Example No. 4
def train_knn(trainlines, model_fname):
    # worddict [class][filenum][words]
    worddict = defaultdict(lambda: defaultdict(dict))
    gtruth_dict = defaultdict(int)
    idf_dict = defaultdict(float)
    chicount_dict = defaultdict(lambda: defaultdict(int))
    fileindex = 0
    # counting the instances of variable
    class_count = defaultdict(int)
    #chifeat_words = ["_RARE_"]
    chifeat_words = []
    total_docs = len(trainlines)
    uniquewords = set()
    stemmer = stem.PorterStemmer()
    stopwords = [
        'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
        'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
        'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
        'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
        'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
        'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
        'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
        'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with',
        'about', 'between', 'into', 'through', 'during', 'before', 'after',
        'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'over',
        'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when',
        'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
        'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than',
        'too', 'very', 's', 'can', 'will', 'just', 'should', 'now', 'movie',
        'would', 'thing', 'film', 'cinema', 'movie', 'movies', 'cinemas', 'tv',
        'documentary', 'y'
    ]
    '''
  stopwords.extend(string.punctuation)
  stopwords = set(stopwords)
  '''
    tokenizer = RegexpTokenizer(r'\w+')
    exp_allnums = re.compile('^[-./]*[0-9][-0-9.,:/]+$')
    exp_wordnums = re.compile('^[A-Za-z]+[0-9]+[-/0-9.A-Za-z]*$')
    exp_numwords = re.compile('^[0-9]?[0-9]+[-./A-Za-z]+$')

    for line in trainlines:
        line = line.strip().lower()
        fileindex = fileindex + 1
        gtruth_dict[fileindex] = int(line[0])
        class_count[int(line[0])] += 1
        #tokenlist = word_tokenize(commentwords)
        commentline = line[3:-1]
        commentline = commentline.replace("it's", "it is")
        commentline = commentline.replace("won't", "will not")
        commentline = commentline.replace("can't", "cannot")
        commentline = commentline.replace("n't", " not")
        commentline = commentline.replace("'ll", " will")
        commentwords = tokenizer.tokenize(commentline)
        # Numeric words
        allnumkeys = set([
            m.group(0) for word in commentwords
            for m in [exp_allnums.search(word)] if m
        ])
        # identify exclusive alphanumeric
        wordnumset = set([
            m.group(0) for word in commentwords
            for m in [exp_wordnums.search(word)] if m
        ])
        # identify exclusive numericalpha
        numwordset = set([
            m.group(0) for word in commentwords
            for m in [exp_numwords.search(word)] if m
        ])
        # removing the punctuations and stop words

        commentwords = [
            stemmer.stem(x) for x in commentwords
            if stemmer.stem(x) not in set(stopwords) | allnumkeys
            | set(wordnumset) | set(numwordset)
        ]

        wordset = set(commentwords)
        # increment once per document that contains the word
        for word in wordset:
            idf_dict[word] += 1
        uniquewords.update(wordset)
        worddict[fileindex] = commentwords

        # getting the chisquare_dict
        for word in commentwords:
            chicount_dict[word][int(line[0])] += 1
    # Convert document counts into IDF values
    for key, value in idf_dict.items():
        idf_dict[key] = float(total_docs) / value

    # calculating chi-square value for each word.
    for word in uniquewords:
        # subscript (term, class)
        n11 = chicount_dict[word][1] + 1
        n10 = chicount_dict[word][0] + 1
        n01 = class_count[1] - n11
        n00 = class_count[0] - n10
        num = (n11 + n10 + n01 + n00) * (n11 * n00 - n10 * n01) * (n11 * n00 -
                                                                   n10 * n01)
        den = (n11 + n01) * (n11 + n10) * (n10 + n00) * (n01 + n00)
        chivalue = float(num) / den
        if chivalue > 0:  # chi-square critical values for reference:
            # 10.83 -- 0.001
            # 7.88  -- 0.005
            # 6.63  -- 0.01
            # 3.84  -- 0.05
            # 2.71  -- 0.1
            chifeat_words.append(word)
    #print(chifeat_words)
    print(len(chifeat_words), "\n")

    uniquewords = chifeat_words
    # assign an integer index to each unique (chi-square-selected) word
    feat_len = len(uniquewords)
    uniquewords = list(uniquewords)
    enumwords_dict = defaultdict(int)
    for i, word in enumerate(uniquewords):
        enumwords_dict[word] = i
    #initialising centroid matrix
    centroid_mat = defaultdict(list)
    centroid_mat[0] = [0] * feat_len
    centroid_mat[1] = [0] * feat_len

    for findex in range(1, len(trainlines) + 1):
        featvect = [0] * feat_len
        words = worddict[findex]
        for word in words:
            if word in uniquewords:
                #featvect[enumwords_dict[word]] += idf_dict[word]/len(words) # normalising length of sentence.
                featvect[enumwords_dict[word]] += float(
                    1)  #/len(words) # normalising length of sentence.
            else:
                featvect[enumwords_dict["_RARE_"]] += float(
                    1)  #/len(words) # normalising length of sentence.
            # replace this value 1 with frequency or tf/idf for better results.
        centroid_mat[gtruth_dict[findex]] = list(
            map(add, centroid_mat[gtruth_dict[findex]], featvect))
    for cl in [0, 1]:
        centroid_mat[cl] = [x / class_count[cl] for x in centroid_mat[cl]]
    fopen = open(model_fname, "wb")
    pickle.dump([centroid_mat, enumwords_dict, idf_dict], fopen)

    fopen.close()
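The chi-square score inlined above is the standard 2x2 contingency-table statistic. A self-contained sketch of the same calculation, written here only for illustration:

# Illustrative helper: n11/n10 are the document counts of the term in class 1
# and class 0 (the caller applies add-one smoothing), n01/n00 the remaining
# documents in each class.
def chi_square(n11, n10, n01, n00):
    n = n11 + n10 + n01 + n00
    num = n * (n11 * n00 - n10 * n01) ** 2
    den = (n11 + n01) * (n11 + n10) * (n10 + n00) * (n01 + n00)
    return float(num) / den

# Reference critical values: 2.71 ~ p=0.1, 3.84 ~ p=0.05, 6.63 ~ p=0.01, 10.83 ~ p=0.001.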
Example No. 5
 def __init__(self):
     self._stemmer = stem.PorterStemmer()
Example No. 6
def regex_str(sentence):
	tokenizer = RegexpTokenizer(r'\w+')
	stemmer = stem.PorterStemmer()
	words = tokenizer.tokenize(sentence)
	return [stemmer.stem(word) for word in words if not stopwords_exsits(word)]
Example No. 7
def extract_features(sentence):
    words = sentence.split()
    stemmer = stem.PorterStemmer()
    result = [stemmer.stem(word) for word in words if not check(word)]
    return ' '.join(result)
Example No. 8
def lower_and_stem(s):
    # PorterStemmer.stem_word() was removed in newer NLTK releases; stem() is the equivalent call
    return stem.PorterStemmer().stem(s.lower())
Example No. 9
def preprocess(trainlines):
    worddict = defaultdict(lambda: defaultdict(int))
    chisquare_dict = defaultdict(float)

    chicount_dict = defaultdict(lambda: defaultdict(int))
    gtruth_dict = defaultdict(int)
    idf_dict = defaultdict(float)
    index = 0
    stemmer = stem.PorterStemmer()
    total_docs = len(trainlines)
    stopwords = [
        'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
        'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
        'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
        'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
        'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
        'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
        'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
        'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with',
        'about', 'between', 'into', 'through', 'during', 'before', 'after',
        'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'over',
        'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when',
        'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
        'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than',
        'too', 'very', 's', 'can', 'will', 'just', 'should', 'now', 'movie',
        'would', 'thing', 'film', 'cinema', 'movie', 'movies', 'cinemas', 'tv',
        'documentary', 'y'
    ]
    stopwords = set(stopwords)
    tokenizer = RegexpTokenizer(r'\w+')

    # Regular Expressions to remove alpha numerics
    exp_allnums = re.compile('^[-./]*[0-9][-0-9.,:/]+$')
    exp_wordnums = re.compile('^[A-Za-z]+[0-9]+[-/0-9.A-Za-z]*$')
    exp_numwords = re.compile('^[0-9]?[0-9]+[-./A-Za-z]+$')

    for line in trainlines:
        line = line.strip().lower()
        index = index + 1
        gtruth_dict[index] = int(line[0])

        #tokenlist = word_tokenize(commentwords)
        commentline = line[3:-1]
        commentline = commentline.replace("it's", "it is")
        commentline = commentline.replace("won't", "will not")
        commentline = commentline.replace("can't", "cannot")
        commentline = commentline.replace("n't", " not")
        commentline = commentline.replace("'ll", " will")
        commentwords = tokenizer.tokenize(commentline)
        commentwords = [
            stemmer.stem(x) for x in commentwords
            if stemmer.stem(x) not in stopwords
        ]

        wordset = list(set(commentwords))
        # counting freq of docs with the "word"
        for word in wordset:
            idf_dict[word] += 1
        # adding to dictionary
        for word in commentwords:
            worddict[int(line[0])][word] += 1
            chicount_dict[word][int(line[0])] += 1

    # removing numerics as key words
    allnumkeys = [
        m.group(0)
        for word in set(worddict[0].keys()) | set(worddict[1].keys())
        for m in [exp_allnums.search(word)] if m
    ]
    # identify exclusive alphanumeric
    wordnumset = [
        m.group(0)
        for word in set(worddict[0].keys()) | set(worddict[1].keys())
        for m in [exp_wordnums.search(word)] if m
    ]
    # identify exclusive numericalpha
    numwordset = [
        m.group(0)
        for word in set(worddict[0].keys()) | set(worddict[1].keys())
        for m in [exp_numwords.search(word)] if m
    ]

    # removing all numerics and alpha numerics.
    allremovablekeys = allnumkeys + numwordset + wordnumset
    for numstr in allremovablekeys:
        worddict[1].pop(numstr, None)
        worddict[0].pop(numstr, None)
        # chicount_dict is keyed by word (then by class), so drop the word itself
        chicount_dict.pop(numstr, None)
        idf_dict.pop(numstr, None)

    # Convert document counts into IDF values
    for key, value in idf_dict.items():
        idf_dict[key] = float(total_docs) / value

    return [worddict, gtruth_dict, idf_dict, chicount_dict]
Example No. 10
def nlp52(sentences: list) -> list:
    stemmer = stem.PorterStemmer()
    return [
        '\n'.join([stemmer.stem(word) for word in sentence.split('\n')])
        for sentence in sentences
    ]
Example No. 11
 def __init__(self, filename):
     self.filename = filename
     self.stemmer = stem.PorterStemmer()
Example No. 12
import gensim
from gensim.models.doc2vec import *
#import cPickle as pickle
import numpy
#from collections import namedtuple
import nltk
import nltk.corpus as nc
import nltk.stem as nsl
import re
import scipy.io as sio
#import numpy as np
#from sklearn.manifold import TSNE

nltk.download('stopwords')  # download only the corpus that is actually used below
stops = set(nc.stopwords.words("english"))
st = nsl.PorterStemmer()


def rev_to_words(rev_line):
    ## from stopping to stemming
    #1. remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", rev_line)
    #2. convert to lower case
    line = letters_only.lower()
    #3. stopword removal (left disabled in the original)
    #meaningful_words = [w for w in line.split() if w not in stops]
    #4. return the result
    return line
    
#f  = open('books_processed_positive.txt','rb')
#text = pickle.load(f)
Example No. 13
File: ex52.py Project: hysy/nlp100
def stemWordsList(words_list):
    return [[(word, stem.PorterStemmer().stem(word)) for word in words]
            for words in words_list]
Example No. 14
def get_stat_dict(nametxt_dict, test=False):

    if test:
        pp_dir = "test/partisan_phrases/"
    else:
        pp_dir = "src/data/init/partisan_phrases/"
    pp_txts = os.listdir(pp_dir)
    score_dict = {}
    for i in pp_txts:
        with open(pp_dir + i) as curtxt:
            for line in curtxt.readlines()[1:]:
                splt = line.split("|")
                score_dict[splt[0]] = float(splt[1].strip())

    nltk.download("wordnet")
    nltk.download("stopwords")

    stpwrds = stopwords.words("english")
    porter = stem.PorterStemmer()

    def preproc_strn(strn):
        # Lowercase, remove digits and doublespaces
        curstr = strn.lower().translate(
            str.maketrans('', '', string.punctuation))
        curstr = re.sub(r'[0-9]+', '', curstr)
        curstr = re.sub(r'\n', ' ', curstr)
        curstr = re.sub(r'  +', ' ', curstr)
        plst = []
        for word in curstr.split():
            # Check for stopwords
            if word not in stpwrds:
                # Porter stem the word
                pword = porter.stem(word)
                plst.append(pword)
        numwords = len(plst)
        curstr = ' '.join(plst)
        return (curstr, numwords)

    def string_score(strn, score_dict):
        # Pre-process, return the processed string and the number of words
        curstr, numwords = preproc_strn(strn)

        # Absolute bias sum
        absscore = 0
        # Bias sum
        sumscore = 0
        # Total number of occurrences of phrases from G&S
        totphrs = 0

        # Dictionary of top 10 phrase counts
        counts_dict = {}

        for key, value in score_dict.items():

            numoccurs = curstr.count(key)
            totphrs += numoccurs
            counts_dict[key] = (numoccurs, value)
            curscore = numoccurs * value
            absscore += abs(curscore)
            sumscore += curscore

        counts_list = sorted(counts_dict.items(),
                             key=lambda item: item[1],
                             reverse=True)[:10]
        return [absscore, sumscore, numwords, counts_list, totphrs]

    namestat_dict = {}

    for name, txt in nametxt_dict.items():
        namestat_dict[name] = string_score(txt, score_dict)

    for name, stat in namestat_dict.items():
        dispcnt = 1
        procname = preproc_strn(name)[0]
        is_intitle = False
        for phr, freq in stat[3]:
            if phr in procname:
                is_intitle = True
            dispcnt += 1

        namestat_dict[name].append(is_intitle)
    return namestat_dict
Example No. 15
def stem_make():
    stemmer = stem.PorterStemmer()
    for word in word_make():
        yield word + "\t" + stemmer.stem(word)
Example No. 16
def stemming_porter():
    stemmer = stem.PorterStemmer()
    for word_stem in separate_word():
        yield (word_stem, stemmer.stem(word_stem))
Example No. 17
from knock71 import stop
from collections import defaultdict
from nltk import stem

stemming = stem.PorterStemmer()

feature_dict = defaultdict(int)
for line in open('sentiment.txt'):
    word_list = line.strip('\n').split()
    word_list.pop(0)
    for word in word_list:
        word = stemming.stem(word)
        if not stop(word):
            feature_dict[word] += 1

for word, freq in sorted(feature_dict.items()):
    print(word + '\t' + str(freq))
        
Example No. 18
	def normalize(self, s, stemmer=stem.PorterStemmer()):
		words = tokenize.wordpunct_tokenize(s.lower().strip())
		return ' '.join([stemmer.stem(w) for w in words])
Example No. 19
def search(query, docTermIndex):
    print('\nRetrieving documents for query \'{}\'\n'.format(query))
    qlist = query.strip().split()
    #Remove stop words
    modified = [
        term for term in qlist if term not in stopwords.words('english')
    ]

    EXPAND = False
    '''Expansion
    EXPAND = True
    for term in modified:
        syns = wordnet.synsets(term) 
        for i in range(len(syns)):
            #print(syns[i].lemmas()[0].name())
            new.append(syns[i].lemmas()[0].name())
    '''
    if not EXPAND:
        new = modified
    before_stem = np.unique(new)
    ps = stem.PorterStemmer()
    after_stem = [ps.stem(word) for word in before_stem]
    mQuery = np.unique(after_stem)

    #read potentially relevant docs into matrix using tf*idf weights
    qMatrix = pd.DataFrame(np.zeros((0, len(mQuery))), columns=mQuery)

    for term in mQuery:
        if term in docTermIndex.keys():
            termInfo = docTermIndex.get(term)
            for occurence in termInfo.occList:
                if occurence.docID not in qMatrix.index:
                    toAppend = pd.Series(np.zeros(len(qMatrix.columns)),
                                         index=qMatrix.columns,
                                         name=occurence.docID)
                    toAppend[term] = occurence.count * termInfo.idf
                    # DataFrame.append was removed in pandas 2.x; concatenate the new row instead
                    qMatrix = pd.concat([qMatrix, toAppend.to_frame().T])
                else:
                    qMatrix.loc[occurence.docID,
                                term] = occurence.count * termInfo.idf

    #compute tfxidf vector of query
    #print(qMatrix.columns)
    q_vect = [docTermIndex.get(term).idf for term in qMatrix.columns]

    #Get cosine similarities for query
    matrix_norm = np.array(
        [np.linalg.norm(qMatrix.iloc[i]) for i in range(len(qMatrix))])
    q_norm = np.linalg.norm(q_vect)
    sims = np.dot(qMatrix, q_vect) / (matrix_norm * q_norm)
    dists = 1 - sims
    idx = np.argsort(dists)

    user_docs = qMatrix.iloc[idx[:10]].index
    classes = pd.read_csv('classes.csv', index_col=1)

    for i, path in enumerate(user_docs):
        parts = path.split('/')
        group = parts[1]
        file = parts[2]
        print('----{}: File {} in folder {}-----\n'.format(i + 1, file, group))
        with open(path, 'r', errors='ignore') as myfile:
            data = myfile.read()
            art = data
            ind = art.find('\n\n')
            art = art[ind + 2:]
            #If article(and not post), re-index again to get rid of tags
            if art[0:10].find('archive') != -1:
                ind = art.find('\n\n')
                art = art[ind + 2:]
            mid = len(art) // 2
            midmid = mid // 2
            print('---------------------------------------------\n')
            print(art[mid:mid + 200])
            print('---------------------------------------------\n')
    return user_docs
Example No. 20
def stemmer(text):
    stemmer = stem.PorterStemmer("NLTK_EXTENSIONS")
    stemmed_tokens = [stemmer.stem(token) for token in text.split()]
    return stemmed_tokens
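The string passed to the constructor selects the stemmer mode. In NLTK 3 the same modes are also exposed as class attributes; an illustrative sketch (not part of the original example):

from nltk import stem

# The three PorterStemmer modes in NLTK 3; each is equivalent to passing the
# corresponding string literal, as the snippet above does.
nltk_ext = stem.PorterStemmer(mode=stem.PorterStemmer.NLTK_EXTENSIONS)
martin_ext = stem.PorterStemmer(mode=stem.PorterStemmer.MARTIN_EXTENSIONS)
original = stem.PorterStemmer(mode=stem.PorterStemmer.ORIGINAL_ALGORITHM)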
Example No. 21
class PorterStemmer(BaseNormalizer):
    name = 'Porter Stemmer'
    normalizer = stem.PorterStemmer().stem
Example No. 22
def get_all_history_stats(test = False):
    
    # Setup for string for cleaning
    stpwrds = stopwords.words("english")
    porter = stem.PorterStemmer()
    nltk.download("wordnet")
    nltk.download("stopwords")
    
    # Base directory for saving/processing article histories
    if test:
        xmls_base = "test/temp/wiki_xmls/"
    else:
        xmls_base = "src/data/temp/wiki_xmls/"
    
    if not os.path.exists(xmls_base):
        os.makedirs(xmls_base)
        
    # Base directory for saving resdicts
    rd_base = "src/data/temp/resdicts/"
    if not os.path.exists(rd_base):
        os.makedirs(rd_base)
    
    # Get and split anames into chunks of 20 for chunk-wise processing
    anames = retrieve_anames()
    alst = [anames[i:i + 20] for i in range(0, len(anames), 20)]
    
    # Load in the score dictionary
    if test:
        pp_dir = "test/partisan_phrases/"
    else:
        pp_dir = "src/data/init/partisan_phrases/"

    pp_txts = os.listdir(pp_dir)
    score_dict = {}
    for i in pp_txts:
        with open(pp_dir + i) as curtxt:
            for line in curtxt.readlines()[1:]:
                splt = line.split("|")
                score_dict[splt[0]] = float(splt[1].strip())
    
    # Helper for cleaning strings
    def preproc_strn(strn):
        # Lowercase, remove digits and doublespaces
        curstr = strn.lower().translate(str.maketrans('', '', string.punctuation))
        curstr = re.sub(r'[0-9]+', '', curstr)
        curstr = re.sub(r'\n', ' ', curstr)
        curstr = re.sub(r'  +', ' ', curstr)
        plst = []
        for word in curstr.split():
            # Check for stopwords
            if word not in stpwrds:
                # Porter stem the word
                pword = porter.stem(word)
                plst.append(pword)
        numwords = len(plst)
        curstr = ' '.join(plst)
        return (curstr, numwords)
        

    def get_art_hists(for_hist):
        for_hist_und = ["_".join(i.split()) for i in for_hist]
        exp_base = "https://en.wikipedia.org/w/index.php?title=Special:Export&pages="
        exp_end = "&history=1&action=submit"
        

        

        for tit in for_hist_und:
            url = exp_base + tit + exp_end
            try:
                resp = requests.get(url)
            except Exception:
                try:
                    time.sleep(10)
                    resp = requests.get(url)
                except Exception:
                    print(tit + " did not get processed")
                    # both attempts failed, so resp is undefined; skip this title
                    continue

            with open(xmls_base + tit + ".xml", mode = "wb") as wfile:
                wfile.write(resp.content)

            resp.close()
            
            
    def get_hist_stats(rdname):
        xmls_list = [x for x in os.listdir(xmls_base) if ".xml" in x]

        resdict = {}
        for fn in xmls_list:

            # This block is for fixing broken xmls with no closing tags
            try:
                tree = ET.parse(xmls_base + fn)
            except Exception as e:
                with open(xmls_base + fn, "a") as app:
                    app.write("  </page>")
                    app.write("</mediawiki>")
                tree = ET.parse(xmls_base + fn)

            # Set up the tree and the list of results for the current article
            root = tree.getroot().find("{http://www.mediawiki.org/xml/export-0.10/}page")
            revlist = []

            for rev in root.findall("{http://www.mediawiki.org/xml/export-0.10/}revision"):
                # The dictionary for each revision
                curdict = {}

                curdict["time"] = rev.find("{http://www.mediawiki.org/xml/export-0.10/}timestamp").text
                txt = rev.find("{http://www.mediawiki.org/xml/export-0.10/}text").text

                if not txt is None:
                    curdict["text"] = txt
                else:
                    curdict["text"] = ""

                comm = rev.find("{http://www.mediawiki.org/xml/export-0.10/}comment")
                if not comm is None:
                    curdict["comm"] = comm.text
                else:
                    curdict["comm"] = ""

                cont = rev.find("{http://www.mediawiki.org/xml/export-0.10/}contributor")
                user = cont.find("{http://www.mediawiki.org/xml/export-0.10/}username")
                if not user is None:
                    curdict["user"] = user.text
                else:
                    curdict["user"] = cont.find("{http://www.mediawiki.org/xml/export-0.10/}ip").text

                revlist.append(curdict)

            resdict[fn[:-4]] = revlist
            
        cnt = 0

        # Populate resdict with stats
        for name, revl in resdict.items():
            prevr = db.bigram(preproc_strn(revl[0]["text"])[0])
            for rev in revl:
#                 if cnt % 1000 == 1:
#                     print(cnt)
                cnt += 1
                curr = db.bigram(preproc_strn(rev["text"])[0])
                diffs = db.unique_items(prevr, curr)
                rem, add = diffs

                # Trying to get the following output: [absscore, sumscore, numwords, counts_list, totphrs]
                rem_abs = 0
                add_abs = 0
                rem_sum = 0
                add_sum = 0
                rem_num = len(rem)
                add_num = len(add)
        #         add_counts = {}
        #         rem_counts = {}
                add_phrs = 0
                rem_phrs = 0

                for bigr in rem:
                    if bigr in score_dict.keys():
                        rem_abs += abs(score_dict[bigr])
                        rem_sum += score_dict[bigr]
                        rem_phrs += 1

                for bigr in add:  
                    if bigr in score_dict.keys():
                        add_abs += abs(score_dict[bigr])
                        add_sum += score_dict[bigr]
                        add_phrs += 1

                rev["rem"] = rem
                rev["add"] = add
                rev["rem_abs"] = rem_abs
                rev["add_abs"] = add_abs
                rev["rem_sum"] = rem_sum
                rev["add_sum"] = add_sum
                rev["rem_num"] = rem_num
                rev["add_num"] = add_num
                rev["rem_phrs"] = rem_phrs
                rev["add_phrs"] = add_phrs

                del rev["text"]
                prevr = curr

        
        with open(rd_base + rdname + ".json", "w") as outfile:  
            json.dump(resdict, outfile) 
            
        
    def del_art_hists():
        files = glob.glob(xmls_base + "*")
        for f in files:
            os.remove(f)
            
            
    if not test:
        for ind, hst in enumerate(alst):
            # For each chunk of 20 scrapes, processes, and deletes the articles
            get_art_hists(hst)
            get_hist_stats("rd" + str(ind+1))
            del_art_hists()
    else:
        get_hist_stats("testrd")
Example No. 23
Created on Tue Apr  4 17:08:23 2017

@author: konodera
"""
import os
print(
    """#==============================================================================
# START !!! {} PID: {}
#==============================================================================
""".format(__file__, os.getpid()))

import utils
import numpy as np
import pandas as pd
from nltk import stem
pt = stem.PorterStemmer().stem

from gensim.models import Doc2Vec
d2v = Doc2Vec.load('../nlp_source/d2v/enwiki_dbow/doc2vec.bin')
from gensim.models import KeyedVectors
w2v = KeyedVectors.load_word2vec_format(
    '../nlp_source/w2v/GoogleNews-vectors-negative300.bin.gz', binary=True)

import gc

train, test = utils.load(7, 1)

train_, test_ = utils.load(2)
train = pd.merge(train, train_, on='id', how='left')
test = pd.merge(test, test_, on='test_id', how='left')
del train_, test_
Example No. 24
def stemming(word):
    stemmer = stem.PorterStemmer()
    try:
        return stemmer.stem(word)
    except Exception:  # fall back to the raw word if stemming fails
        return word
Example No. 25
* filtering stopwords
* stemming
* lemmatization
* custom string similarity metric based on string-edit distance
"""
from nltk import stem
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

from DAS.keywordsearch.config import USEFUL_STOPWORDS
from DAS.keywordsearch.config import get_setting
from DAS.keywordsearch.entity_matchers.string_dist_levenstein \
    import levenshtein_normalized as levenshtein_norm
from DAS.keywordsearch.utils import memo

STEMMER = stem.PorterStemmer()
LMTZR = WordNetLemmatizer()
EN_STOPWORDS = stopwords.words('english')
EN_STOPWORDS_SET = set(EN_STOPWORDS)
USEFUL_STOPWORDS_SET = set(USEFUL_STOPWORDS)

lemmatize = memo(LMTZR.lemmatize)
lemmatize.__doc__ = "cached version of lmtzr.lemmatize"

getstem = memo(STEMMER.stem)
getstem.__doc__ = "cached version of PorterStemmer() stem"

# load the lemmatization DB now
lemmatize("dataset")

Example No. 26
 def normalize_word(word):
     word = word.lower()
     stemmer = stem.PorterStemmer()
     return stemmer.stem(word)
Example No. 27
import pandas as pd
import re
import nltk
from nltk import stem
import pickle

# data file
filename = "labeledTrainData.tsv"
# stop-words file
stopfile = "stop_words.txt"
stemmer = stem.PorterStemmer()  # initialize the nltk.stem Porter stemmer


# Text pre-processing step 1: takes a raw string, returns a list of stemmed words
def text_pre_process1(text):
    text_1 = re.sub(r"</?[^>]*>|\\|n*'[\w]*|[^(\w|\s)]", ' ',
                    text)  # strip HTML tags, backslashes, English contractions, non-word characters
    text_2 = nltk.word_tokenize(text_1)  # tokenize with nltk
    text_3 = []
    for word in text_2:
        text_3.append(stemmer.stem(word))  # stem each token with nltk.stem
    return text_3


# Text processing step 2: remove stop words; takes a word list and a stop-word list
def stop_words_process2(word_list, stop_list):
    word_clean = []
    for word in word_list:
        if word.lower() in stop_list:
            continue
        word_clean.append(word)
    return word_clean
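A small usage sketch chaining the two helpers above (illustrative only; the sample text and stop-word list are placeholders, not data from the project):

# Hypothetical end-to-end use of the two pre-processing steps.
sample_review = "<br />This movie wasn't great, but the acting was fine."
sample_stops = ["the", "but", "was", "this"]
tokens = text_pre_process1(sample_review)
print(stop_words_process2(tokens, sample_stops))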
Example No. 28
    dictionary.save(save_direc)
    return dictionary


docs_direc = 'data/raw/ohsumed_docs_stemmed.pickle'
q_direc = 'data/raw/querry_content_stemmed.pickle'
dct_direc = 'data/dcts/dict'

print('Loading Data')
with open(docs_direc, 'rb') as f:
    d = pickle.load(f)

print('Loading queries')
with open(q_direc, 'rb') as f:
    q = pickle.load(f)

print('Getting docs')
docs = get_docs(d, q)
print(len(docs))

stemmer = ns.PorterStemmer()
stop_list = []
for w in stop_words:
    stop_list.append(stemmer.stem(w))

for i in range(1, 6):
    print('Making dictionary for i =', str(i))
    direc = dct_direc + str(i)
    dct = make_dictionary(docs, stop_list, i, direc)
    print(dct)
Example No. 29
# -*- coding: utf-8 -

from nltk import stem
from collections import defaultdict
from knock71 import getstopword

stoplist = getstopword()
stemmer = stem.PorterStemmer()

def create_features(input_file):
    phi = defaultdict(int)
    for line in open(input_file, "r"):
        words = line.strip("\n").split()
        for word in words[1:]:
            word = stemmer.stem(word)
            if word not in stoplist:
                phi[word] += 1
    return phi

def create_feature_vector(line, features_dict):
    featureline_list = list()
    feature_vector = list()
    words = line.strip("\n").split()
    for word in words[1:]:
        word = stemmer.stem(word)
        if word not in stoplist:
            featureline_list.append(word)
Example No. 30
 def test_get_stems(self):
     porterStem = stem.PorterStemmer()
     self.assertEqual(get_stems('during', porterStem), 'dure')
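get_stems itself is not shown in this excerpt; a minimal sketch consistent with the assertion (an assumption, not the project's actual implementation):

# Hypothetical helper matching the test: apply the given stemmer to one word,
# so get_stems('during', PorterStemmer()) returns 'dure'.
def get_stems(word, stemmer):
    return stemmer.stem(word)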