Example #1
    def removeURLsCleanStem(self):	# preprocessor
        '''
        Remove URLs and punct, lower case everything,
        Convert '-/-' to 'mut_mut',
        Keep tokens that start w/ letter or _ and are 2 or more chars.
        Stem,
        Replace \n with spaces
        '''
        # This is currently the only preprocessor that uses a stemmer.
        # It would be clearer to import and instantiate one stemmer above,
        # BUT that requires nltk (via anaconda) to be installed on each
        # server we use, and it is not currently installed on our linux servers.
        # By importing here, we can use RefSample in situations where we don't
        # call this preprocessor, and it will work on our current server setup.
        global stemmer
        if not stemmer:
            import nltk.stem.snowball as nltk
            stemmer = nltk.EnglishStemmer()
        #------
        def _removeURLsCleanStem(text):
            output = ''
            for s in urls_re.split(text): # split and remove URLs
                s = featureTransform.transformText(s).lower()
                for m in token_re.finditer(s):
                    output += " " + stemmer.stem(m.group())
            return output
        #------

        self.setTitle( _removeURLsCleanStem( self.getTitle()) )
        self.setAbstract( _removeURLsCleanStem( self.getAbstract()) )
        self.setExtractedText( _removeURLsCleanStem( self.getExtractedText()) )
        return self
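This preprocessor relies on several module-level names defined elsewhere in the same project (a stemmer placeholder, urls_re, token_re, and the featureTransform module). A minimal sketch of those definitions, inferred from the docstring and from the very similar code in Example #25 (the transformText body is an assumption):

import re

stemmer = None  # lazily replaced with nltk.stem.snowball.EnglishStemmer() on first use
urls_re = re.compile(r'\bhttps?://\S*', re.IGNORECASE)    # match URLs (same pattern as in Example #25)
token_re = re.compile(r'\b([a-z_]\w+)\b', re.IGNORECASE)  # tokens starting with a letter or '_', 2+ chars


class featureTransform:                       # hypothetical stand-in for the project's featureTransform module
    @staticmethod
    def transformText(s):
        return s.replace('-/-', ' mut_mut ')  # assumed behavior, per the docstring: convert '-/-' to 'mut_mut'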
Example #2
def stem( token ):
	token_stem = token.decode( 'utf-8', 'ignore' )
	try: # note: stemming includes lower casing
		token_stem = snowball.EnglishStemmer().stem( token_stem.lower() )
	except:
		pass
	return token_stem
Example #3
def get_stemmer(stemmer_type):
    if stemmer_type == 'lancaster':
        stemmer = LancasterStemmer()
    elif stemmer_type == 'porter':
        stemmer = PorterStemmer()
    else:
        stemmer = snowball.EnglishStemmer()
    return stemmer
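Example #3 assumes the NLTK stemmer classes are already imported; a minimal usage sketch:

from nltk.stem import LancasterStemmer, PorterStemmer, snowball

print(get_stemmer('porter').stem('running'))   # 'run'
print(get_stemmer('unknown').stem('running'))  # falls back to the Snowball English stemmer: 'run'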
Example #4
 def __init__(self, language='english'):
     self.documents = None
     self.training_queries = None
     self.validation_queries = None
     self.test_queries = None
     self.language = language
     self.stemmer = snowball.EnglishStemmer()
     self.stop_words = set(stopwords.words('english'))
Example #5
def stemmed_count(words):
    stemmed_to_variant_counter = collections.defaultdict(collections.Counter)
    stemmer = snowball.EnglishStemmer()

    for word in words:
        normalized_word = stemmer.stem(word).lower()
        stemmed_to_variant_counter[normalized_word][word] += 1

    return flatten_dict_of_counts(stemmed_to_variant_counter)
Example #6
 def __init__(self, k=5, language='english'):
     """
     documents: pandas.DataFrame, [docid] => text string
     k: number of folders for the queries
     """
     self.documents = None
     self.k = k
     self.language = language
     self.stemmer = snowball.EnglishStemmer()
     self.stop_words = set(stopwords.words('english'))
Example #7
def word_count():
    """
    Get the word count of each word in the corpus (to be used for calculating vocabulary richness).
    Note that this uses the results of the SENNA tagger for POS, not the Stanford Tagger.
    """
    count = {}  # map words to counts
    stemmer = snowball.EnglishStemmer(
    )  # for stemming words so that different forms aren't counted separately
    stoplist = set(
        """( ) : , . { } [ ] ; . ' " ! ? @ # $ % * \ + what at "n't" if for a an on of the and to from in by or either neither so where there those these this that it which who whose but be is have should would it about into 've he she them i i. w. hi him her my me you your their our we with 't 's then than when have not"""
        .split())

    dn = '/Volumes/Seagate Slim/litlab/tagged/'
    books = [fn for fn in os.listdir(dn) if fn.endswith('_tagged.txt')]

    for book in books:
        print book
        with open(dn + book) as f:
            sent_count = 0

            # skip the first three sentences (which is just meta-data)
            while sent_count < 3:
                line = f.readline().strip()

                if not line:
                    sent_count = sent_count + 1

            # each token is on a separate line
            for line in f:
                if line.strip():
                    token, pos = line.split()[:2]

                    parenth_replacements = {'(': '-LRB-', ')': '-RRB-'}
                    token = parenth_replacements.get(token, token)

                    # handle edge case that you forgot to handle earlier
                    if token.startswith('grey-'):
                        token = 'gray-' + token[5:]

                    # for colors or things that share a name with a color
                    is_color = token.startswith(
                        "__COLOR__") and token.endswith("__COLOR__")
                    tokens = (token[9:-9] if is_color else token).split("_")

                    for t in tokens:
                        t = stemmer.stem(t.lower(
                        ))  # lowercase and stem words for normalization
                        if t not in stoplist:  # exclude proper nouns, common words, and punctuation
                            if t not in count:
                                count[t] = 1
                            else:
                                count[t] = count[t] + 1

    pickle.dump(count, open('word_count', 'w'))
Example #8
def word_to_id(word, index):
    """Returns the id of the well-formatted version of 'word'. If 'word' is not in 'index', puts it in, and returns the new id."""
    word = word.decode('utf-8').lower()
    word = word.strip("\"")
    word = word.split("\'s")[0]
    esb = snowball.EnglishStemmer()
    word = esb.stem(word)
    try:
        return index[word]
    except KeyError:
        id_ = len(index)
        index[word] = id_
        return id_
Example #9
 def __init__(self):
     self.patt = re.compile(
         '(?u)\w+://[\w\./#]+|&\w+;|\s+-\s+|\s+:\s+|\s+|[^\w\'-]+')
     self.stopwords = []
     self.stemmer = snowball.EnglishStemmer()
     if 'stopwords.txt' in os.listdir():
         try:
             # use a context manager so the file is always closed
             with open('./stopwords.txt', encoding='utf-8') as f:
                 self.stopwords = eval(f.read())
         except Exception:
             self.stopwords = stopwords.words('english')
     else:
         self.stopwords = stopwords.words('english')
Example #10
def get_all_words():
    if 'all_words' not in datavars and os.path.exists(FILENAME):
        with open(FILENAME, 'r') as datafile:
            datavars.update(json.load(datafile))
    if 'all_words' not in datavars:  # even after loading file
        stemmer = snowball.EnglishStemmer(ignore_stopwords=True)
        all_words = sorted(
            list(
                set(
                    stemmer.stem(w)
                    for w in corpus.words() if w.isalnum() and len(w) > 3) -
                set(stopwords.words())))
        datavars['all_words'] = all_words
        with open(FILENAME, 'w') as datafile:
            json.dump(datavars, datafile, indent=4)
    return datavars['all_words']
Example #11
def calc_vocab_score(tokens):
	"""
	Return a float representing the richness of the vocabulary in the sentence. The rarer the
	vocabulary, the higher the score (all scores are non-negative). The score is computed as
	score = sum( log( ||C|| / w_c ) ) / num_used, where w_c is the corpus frequency of word w,
	||C|| is the total corpus count (corpus_count['_TOTAL_']), and num_used is the number of
	tokens that were found in the corpus. This rewards the use of rare words rather than
	penalizing the use of common ones.
	"""
	stemmer = snowball.EnglishStemmer() # for stemming words so that different forms aren't counted separately
	score = 0
	num_used = 0	# number of tokens actually used in the calculation

	for token in tokens:
		token = stemmer.stem(token.lower())

		if token in corpus_count:	# this check automatically excludes stopwords and pronouns
			score = score + math.log( corpus_count['_TOTAL_'] ) - math.log( corpus_count[token] )
			num_used = num_used + 1

	return score / max(float(num_used), 1)
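For intuition, a worked example with a hypothetical corpus_count (in the real project it is built by a word-count step like Example #7):

import math
corpus_count = {'_TOTAL_': 1000, 'rare': 10, 'common': 500}  # hypothetical counts for illustration
print(calc_vocab_score(['rare', 'common']))
# (log(1000/10) + log(1000/500)) / 2 = (4.605 + 0.693) / 2, roughly 2.65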
Example #12
def preprocess_text(tokens):

    wnl = nltk.WordNetLemmatizer()
    st = snowball.EnglishStemmer()

    # remove stopwords, punctuation, non-alphabetic characters; stem and lemmatize
    sentences = []
    for i in range(len(tokens)):
        words = [
            w for w in tokens[i] if w.lower() not in stopwords.words('english')
            and not w in string.punctuation
        ]
        words = [w for w in words if w.isalpha()]
        words = [st.stem(w) for w in words]
        # need to exclude pos from lemmatization
        for j in range(len(words)):
            if words[j] != 'pos':
                words[j] = wnl.lemmatize(words[j])
        sentences.append(words)

    return sentences
Example #13
from nltk import pos_tag
from nltk.stem import snowball, WordNetLemmatizer, PorterStemmer, SnowballStemmer
stemmer = snowball.EnglishStemmer()
import re
import sys
from myModule import extraPrograms
from myModule.objects import bidict
import numpy as np
from nltk import wordnet
from nltk.tokenize import word_tokenize
from chemtok import ChemTokeniser
wordnet = wordnet.wordnet

nltk_pos = [
    'LS', 'TO', 'VBN', "''", 'WP', 'UH', 'VBG', 'JJ', 'VBZ', '--', 'VBP', 'NN',
    'DT', 'PRP', ':', 'WP$', 'NNPS', 'PRP$', 'WDT', '(', ')', '.', ',', '``',
    '$', 'RB', 'RBR', 'RBS', 'VBD', 'IN', 'FW', 'RP', 'JJR', 'JJS', 'PDT',
    'MD', 'VB', 'WRB', 'NNP', 'EX', 'NNS', 'SYM', 'CC', 'CD', 'POS', '#'
]
pos2idx = bidict({pos: i for (i, pos) in enumerate(nltk_pos)})


class chemtok:
    def __init__(self, kwargs={}):

        self.kwargs = kwargs

    def __call__(self, sents):

        return [
            ChemTokeniser(sent, **self.kwargs).getTokenStringList()
            for sent in sents
        ]
Example #14
def process_words(text,
                  language=None,
                  stem=True,
                  to_ascii=True,
                  character_level=False):
    if language is None:
        translator = Translator()
        if isinstance(text, list):
            language = translator.detect(text)[0].lang

    if language == "ro":
        if isinstance(text, list):
            text = [replace_diactitics(subtext) for subtext in text]
        else:
            text = replace_diactitics(text)

    if isinstance(text, list):
        if to_ascii:
            text = [
                unicodedata.normalize('NFKD',
                                      subtext).encode('ascii',
                                                      'ignore').decode("ascii")
                for subtext in text
            ]
        text = [subtext.lower() for subtext in text]
    else:
        if to_ascii:
            text = unicodedata.normalize('NFKD',
                                         text).encode('ascii',
                                                      'ignore').decode("ascii")
        text = text.lower()

    if language == "ro":
        stemmer = snowball.RomanianStemmer()
        if isinstance(text, list):
            words = [nltk.word_tokenize(subtext) for subtext in text]
        else:
            words = nltk.word_tokenize(text)
        procced_text = []

    elif language == "it":
        stemmer = snowball.ItalianStemmer()
        if isinstance(text, list):
            words = [nltk.word_tokenize(subtext) for subtext in text]
        else:
            words = nltk.word_tokenize(text)
        procced_text = []

    elif language == "en":
        stemmer = snowball.EnglishStemmer()
        if isinstance(text, list):
            words = [nltk.word_tokenize(subtext) for subtext in text]
        else:
            words = nltk.word_tokenize(text)
        procced_text = []
    else:
        if isinstance(text, list):
            words = [nltk.word_tokenize(subtext) for subtext in text]
        else:
            words = nltk.word_tokenize(text)
        procced_text = []

    stopw = []
    if language in stopwords:
        stopw = stopwords[language]

    if isinstance(text, list):
        for i in range(len(words)):
            sent = words[i]
            sentence = []
            if stem:
                for word in sent:
                    word = keep_only_letters(word)

                    if word not in stopw:
                        if character_level:
                            sentence += list(word)
                        else:
                            sentence.append(stemmer.stem(word))
            else:
                for word in sent:
                    word = keep_only_letters(word)
                    if word not in stopw:
                        if character_level:
                            sentence += list(word)
                        else:
                            sentence.append(word)
            procced_text.append(sentence)
    else:
        for word in words:
            word = keep_only_letters(word)
            if word not in stopw:
                if character_level:
                    procced_text += list(word)
                else:
                    procced_text.append(word)

    return procced_text
Example #15
def process_for_named_entity(text,
                             language,
                             to_ascii=True,
                             stem=False,
                             shorten=False):
    if language == "ro":
        if isinstance(text, list):
            text = [replace_diactitics(subtext) for subtext in text]
        else:
            text = replace_diactitics(text)

    if isinstance(text, list):
        if to_ascii:
            text = [
                unicodedata.normalize('NFKD',
                                      subtext).encode('ascii',
                                                      'ignore').decode("ascii")
                for subtext in text
            ]
        text = [subtext.lower() for subtext in text]
    else:
        if to_ascii:
            text = unicodedata.normalize('NFKD',
                                         text).encode('ascii',
                                                      'ignore').decode("ascii")
        text = text.lower()

    if language == "ro":
        stemmer = snowball.RomanianStemmer()
        if isinstance(text, list):
            words = [nltk.word_tokenize(subtext) for subtext in text]
        else:
            words = nltk.word_tokenize(text)
        procced_text = []

    elif language == "it":
        stemmer = snowball.ItalianStemmer()
        if isinstance(text, list):
            words = [nltk.word_tokenize(subtext) for subtext in text]
        else:
            words = nltk.word_tokenize(text)
        procced_text = []

    elif language == "en":
        stemmer = snowball.EnglishStemmer()
        if isinstance(text, list):
            words = [nltk.word_tokenize(subtext) for subtext in text]
        else:
            words = nltk.word_tokenize(text)
        procced_text = []
    else:
        if isinstance(text, list):
            words = [nltk.word_tokenize(subtext) for subtext in text]
        else:
            words = nltk.word_tokenize(text)
        procced_text = []

    if isinstance(text, list):
        for i in range(len(words)):
            sent = words[i]
            sentence = []
            if stem:
                for word in sent:
                    word = re.sub("[^a-z0-9]", "", word)
                    if word != '':
                        sentence.append(stemmer.stem(word))
            else:
                for word in sent:
                    word = re.sub("[^a-z0-9]", "", word)
                    if word != '':
                        sentence.append(word)
            procced_text.append(sentence)
    else:
        for word in words:
            word = re.sub("[^a-z0-9]", "", word)
            if word != '':
                if stem:
                    word = stemmer.stem(word)
                procced_text.append(word)

    if isinstance(text, list):
        for i in range(len(procced_text)):
            company_name = procced_text[i]
            if len(company_name) > 0 and company_name[0] != 'null':
                if False and company_name[-1] in [
                        'srl', 'ltd', 'spa', 'ltda', 'sl', 'snc'
                ]:
                    contracted = ' '.join(company_name[:-1])
                    if not check_if_text_in_language(
                            company_name[:-1]
                    ) and len(contracted) > 6 and not is_number(
                            contracted) and contracted not in [
                                'data', 'aprile', 'group', 'azienda',
                                'profilo', 'alumino', 'stato', 'roma',
                                'service', 'area', 'estate', 'date 4', 'work',
                                'altre', 'italia', 'stage', 'ottobre 2008',
                                'strada', '16 luglio', 'espresso', 'export',
                                'prime', 'sala', 'panelli'
                            ]:

                        del company_name[-1]
                        if shorten:
                            while len(contracted) > 23:
                                if len(contracted) - len(company_name[0]) < 15:
                                    break
                                del company_name[0]
                                contracted = ' '.join(procced_text)
                procced_text[i] = ' '.join(company_name)
    else:

        if len(procced_text) > 0 and procced_text[0] != 'null':
            if False and procced_text[-1] in [
                    'srl', 'ltd', 'spa', 'ltda', 'sl', 'snc'
            ]:
                contracted = ' '.join(procced_text[:-1])
                if not check_if_text_in_language(procced_text[:-1]) and len(
                        contracted) > 6 and not is_number(
                            contracted) and contracted not in [
                                'data', 'aprile', 'group', 'azienda',
                                'profilo', 'allumino', 'stato', 'roma',
                                'service', 'area', 'estate', 'metalmeccanica',
                                'date 4', 'work', 'castel',
                                'altre', 'italia', 'controlo qualita', 'stage',
                                'ottobre 2008', 'atena', 'strada', '16 luglio',
                                'industriale', 'espresso', 'export', 'prime',
                                'sala', 'panelli'
                            ]:
                    del procced_text[-1]
                    if shorten:
                        while len(contracted) > 23:
                            if len(contracted) - len(procced_text[0]) < 15:
                                break
                            del procced_text[0]
                            contracted = ' '.join(procced_text)

            procced_text = ' '.join(procced_text)

    return procced_text
Example #16
 def tokenizer(string):
     stemmer = snowball.EnglishStemmer(ignore_stopwords=True)
     regex = re.compile('\w\w+')
     return tuple(stemmer.stem(w) for w in regex.findall(string))
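A quick usage sketch of the tokenizer above (the stemmed output is approximate):

print(tokenizer("Stemming reduces related words"))
# roughly: ('stem', 'reduc', 'relat', 'word')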
Example #17
def process(directory, save=".", threshold=0.7, include=False, subdirs=True):

    try:
        threshold = float(threshold)
    except ValueError:
        print("Invalid threshold value provided.")
        print("Using default 0.7")
        threshold = 0.7

    if threshold > 1:
        threshold /= 100

    # get filepaths

    filepaths = generate_file_list(directory, subdirs)

    # terminate program if no paths

    if len(filepaths) < 1:
        print("No files found")
        return "No files found"

    # terminate if only one file

    if len(filepaths) == 1:
        print("The comparison process requires at least two documents.")
        return "The comparison process requires at least two documents."

    # invalid files are reported in error report
    # valid files are processed further

    containers, invalid_files = list(), list()

    # record vocabulary

    vocabulary = list()

    stemmer = snowball.EnglishStemmer()

    for index, path in enumerate(filepaths, 1):

        try:
            file_contents = read_file(path)     # file can be opened
        except Exception:                       # file cannot be opened
            print("File %s could not be opened\n" % path)
            invalid_files.append((path, "File could not be opened."))
            continue

        if file_contents is False:  # file contents cannot be read
            invalid_files.append((path, "File is not in an acceptable format."))
            continue
        else:                       # file contents can be read
            print("\n")
            print("Parsing %s" % path)
            c = Container()
            c.set_path(path)
            c.set_index(index)

            # store raw file contents

            c.set_raw_text(file_contents)

            # retrieve named entities

            c.set_named_entities(get_named_entities(c.get_raw_text()))

            # print("Retrieved raw text")

            # tokenise file conents

            c.set_token_list(word_tokenize(c.get_raw_text()))

            # print("Tokenised")

            # only keep token stems if tokens aren't punctuation or stop words

            normalised = [stemmer.stem(token.lower()) for token in c.get_token_list()
                          if token not in punctuation and token.lower() not in stopwords.words("english")]

            # add named entities to list of normalised tokens

            [normalised.append(entity) for entity in c.get_named_entities()]

            c.set_normalised_text_list(normalised)

            # print("Normalised")

            # cannot compare files with no text, report error and skip

            if len(c.get_normalised_text_list()) < 1:
                print("File %s is either empty or contains no significant terms." % path)
                invalid_files.append((path, "File is either empty or contains no significant terms"))
                continue

            # add any new words to global vocabulary

            [vocabulary.append(token) for token in c.normalised if token not in vocabulary]

            # # initialise frequency dictionaries
            #
            # c.term_frequencies = dict()
            # c.inverse_document_frequencies = dict()

            # store container to allow further processing later

            containers.append(c)

            print("File %s parsed" % path)

    # refuse further processing if there aren't at least two files containing text

    if len(containers) < 2:
        return "At least two documents containing text are required."

    # collect normalised documents in a single list to be used in idf calculation

    print("\nGathering normalised documents...")

    normalised_documents = [container.get_normalised_text_list() for container in containers]

    # print("Normalised documents gathered")

    # calculate term frequencies and inverse document frequencies for each term in each file

    print("Calculating inverse document frequencies...")

    inverse_document_frequencies = dict()

    for word in vocabulary:
        inverse_document_frequencies[word] = calculate_idf(word, normalised_documents)

    print("Calculating term frequencies...")

    for container in containers:
        term_freqs = dict()
        for word in vocabulary:
            term_freqs[word] = container.normalised.count(word) / len(container.normalised)
        container.set_term_frequencies(term_freqs)

    # shallow-copying already existing idf dict is faster than recreating each time

    for container in containers:
        container.set_inverse_document_frequencies(inverse_document_frequencies.copy())

    # print("Calculated term frequencies and inverse document frequencies")
    print("Comparing documents...")

    results = []

    threshold *= 100

    # calculate similarity for every pair of documents

    for x in range(0, len(containers)):
        for y in range(x + 1, len(containers)):
            container_x = containers[x]
            container_y = containers[y]

            vector_x, vector_y = create_dense_vectors(container_x, container_y)

            if len(vector_x) == 0 or len(vector_y) == 0:
                similarity = 0.0
            else:
                similarity = round(cosine_similarity(vector_x, vector_y) * 100, 2)

            if similarity >= threshold or include:
                results.append((container_x.get_path(), container_y.get_path(), similarity))

    print("Ordering results...")

    results = order_similarity_tuples(results)

    print("Generating report...")

    # generate html string to write to file

    string = generate_html_string(results, invalid_files, threshold)

    # write html string to file
    write_path = "%s/similarity_report.html" % save
    try:
        write_string_to_file(string, write_path)
    except FileNotFoundError as e:
        write_path = "./similarity_report.html"
        write_string_to_file(string, write_path)

    print("Report generated!")

    return "Success"
Example #18
 def __init__(self, k=5, language='english'):
     self.documents = None
     self.k = k
     self.language = language
     self.stemmer = snowball.EnglishStemmer()
     self.stop_words = set(stopwords.words('english'))
Example #19
from bs4 import BeautifulSoup, NavigableString, Tag
from nltk.stem import snowball
import re
import datetime
import numpy as np

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import train_test_split

stemmers = [snowball.FrenchStemmer(), snowball.EnglishStemmer()]


def _vectorizer(tfidf):
    if tfidf:
        return TfidfVectorizer(analyzer="word", strip_accents='unicode',
                               tokenizer=None, encoding=u'utf-8',
                               preprocessor=None,
                               stop_words=None,
                               max_features=5000)
    else:
        return CountVectorizer(analyzer="word", strip_accents='unicode',
                               tokenizer=None, encoding=u'utf-8',
                               preprocessor=None,
                               stop_words=None,
                               max_features=5000)


def preprocess_posts(data, with_stemmer):
    print "Preprocessing posts..."
Example #20
#Spanish version of preprocessing
#Bing translator Spanish->English
#English stopwords
#English SnowballStemmer

import sys
import re
from nltk.corpus import stopwords
from mstranslator import Translator
#English and Spanish stemmer available
from nltk.stem import snowball
import string
import regex
#English because we translate first
stemmer = snowball.EnglishStemmer(ignore_stopwords=False)

translator = Translator('60864ac93121426d8fbbb1e2581a8c3e')

stop_words_list = []
flat_stop_words_list = []
exclusion_list_en_es = []
has_hashtag_or_mention = [False] * 800

punctuation = []
punctuation.append(list(string.punctuation[2:6]))
punctuation.append(string.punctuation[9])
punctuation.append(list(string.punctuation[20:22]))


def make_stop_words_list():
    #exclude words which are in both dictionaries
Example #21
 def test_stem_document(self):
     before = ["Computer", "Science"]
     after = TextProcessing.stem_document(before, snowball.EnglishStemmer())
     assert after[0] == "comput" and after[1] == "scienc"
Example #22
def __getStemmer(language):
    return {
        'russian': snowball.RussianStemmer(),
        'english': snowball.EnglishStemmer()
    }.get(language)
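Since .get() is called without a default, any unsupported language returns None; a small usage sketch:

print(__getStemmer('english').stem('running'))  # 'run'
print(__getStemmer('german'))                   # None -- no fallback is defined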
Example #23
def main(out_dir, source, years):
    page = requests.get(
        "http://storage.googleapis.com/books/ngrams/books/datasetsv2.html"
    )  # gets the urls of the 1gram datafiles
    pattern = re.compile('href=\'(.*%s-%s-%s-.*\.gz)' %
                         (source, TYPE, VERSION))
    urls = pattern.findall(page.text)
    del page

    year_counts = {
    }  # These are dicts that contain the occurrence of a word in each year
    year_doc_counts = {}
    year_pos = {}
    for year in years:
        year_pos[year] = {
        }  # Counts the occurrence of a word (distinguish words by pos)
        year_counts[year] = {
        }  # Counts the occurrence of a word (does not distinguish words by pos)
        year_doc_counts[year] = {}  # Counts the books where the word occurred

    print "Start loop"
    for url in urls:  # iterates through the urls
        name = re.search('%s-(.*).gz' % VERSION, url).group(1)
        print "Downloading", name

        success = False
        while not success:  # downloads the actual datafile
            with open(out_dir + name + '.gz', 'w') as f:
                try:
                    f.write(urllib2.urlopen(url, timeout=60).read())
                    success = True
                except:
                    continue

        print "Unzipping", name  # unzips the downloaded datafile
        subprocess.call(['gunzip', '-f', out_dir + name + '.gz', '-d'])

        print "Going through", name  # iterates through the lines of the datafile and counts the occurrence of the words
        with open(out_dir + name) as f:
            for l in f:
                try:
                    split = l.strip().split('\t')
                    if not POS.match(split[0]):
                        continue
                    count = int(split[2])
                    if count < 10:
                        continue
                    word_info = split[0].split("_")
                    pos = word_info[-1]
                    word = word_info[0].decode('utf-8').lower()
                    word = word.strip("\"")
                    word = word.split("\'s")[0]
                    if not word.isalpha():
                        continue
                    esb = snowball.EnglishStemmer()
                    word = str(esb.stem(word))
                    year = int(split[1])
                    doc_count = int(split[3])
                    if not year in years:
                        continue
                    if not word in year_counts[year]:
                        year_counts[year][word] = 0
                        year_doc_counts[year][word] = 0
                        year_pos[year][word] = collections.Counter()
                    year_counts[year][word] += count
                    year_doc_counts[year][word] += doc_count
                    year_pos[year][word][pos] += count
                except UnicodeDecodeError:
                    pass

        print "Deleting", name  # deletes the downloaded files
        try:
            os.remove(out_dir + name)
            os.remove(out_dir + name + '.gz')
        except:
            pass

    print "Writing..."  # writes the data into pkl files
    for year in years:
        ioutils.write_pickle(year_counts[year],
                             out_dir + str(year) + "-counts.pkl")
        ioutils.write_pickle(year_doc_counts[year],
                             out_dir + str(year) + "-doc_counts.pkl")
        ioutils.write_pickle(year_pos[year], out_dir + str(year) + "-pos.pkl")
Example #24
def stem(word, stemmer=snowball.EnglishStemmer()):
    """Stem a word using Snowball by default.
    """
    return stemmer.stem(word)
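Because the default argument is evaluated once at function definition, every call without an explicit stemmer reuses the same EnglishStemmer instance; a short usage sketch:

print(stem('running'))                               # 'run', via the shared default stemmer
print(stem('corriendo', snowball.SpanishStemmer()))  # any other Snowball stemmer can be passed in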
Example #25
# Stemming....
# Probably best to preprocess the whole data set once
#  and stem it (and remove URLs) if stemming makes a big enough difference.
#
# Stemming in Vectorizer subclasses:
# See: https://stackoverflow.com/questions/36182502/add-stemming-support-to-countvectorizer-sklearn
# This is subtle:
# Vectorizers have build_preprocessor() method that returns a preprocessor()
#   function.
# The preprocessor() function is called for each document (string) to do any
#   preprocessing, returning string.
# What we do here:    Subclass each of the common Vectorizers
#  and override the build_preprocessor() method to return a stemming
#    preprocessor function.
# ---------------------------
stemmer = nltk.EnglishStemmer()  # assumes 'import nltk.stem.snowball as nltk', as in Example #1
token_re = re.compile("\\b([a-z_]\w+)\\b", re.IGNORECASE)  # match words


class StemmedCountVectorizer(CountVectorizer):
    def build_preprocessor(self):  # override super's build_preprocessor method
        '''
        Return preprocessor function that stems.
        '''
        # get the super class's preprocessor function for this object.
        preprocessor = super(type(self), self).build_preprocessor()

        # Tokenize and stem the string returned by the super's preprocessor
        #   method.
        # This should stem all words in {bi|tri|...}grams and preserve any
        #   functionality implemented in the preprocessor.
        # Return a preprocessor that stems every token produced by the
        #   original preprocessor.
        return lambda doc: ' '.join(
            stemmer.stem(m.group()) for m in token_re.finditer(preprocessor(doc)))

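A brief usage sketch of the stemming vectorizer above, assuming the usual CountVectorizer import from sklearn.feature_extraction.text at the top of that module (feature-name access varies by scikit-learn version; get_feature_names_out() exists in >= 1.0, older releases use get_feature_names()):

vectorizer = StemmedCountVectorizer(ngram_range=(1, 2))
counts = vectorizer.fit_transform(["Stemming reduces related words",
                                   "the stemmed words were reduced"])
print(vectorizer.get_feature_names_out())  # unigrams and bigrams built from stems, e.g. 'stem', 'reduc relat'
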
class SampleRecord(object):
    """
    Represents a training sample or a sample to predict.
    A training sample has a known class that it belongs to.
    A sample to predict may or may not have a known class (sometimes we
    do predictions on samples for which we know what class they belong to).
    Knows how to take a text representation of a record (typically a
    text string with delimited fields) and parse it into its fields.
    Provides various methods to preprocess a sample record (if any).

    A SampleRecord can be marked as "reject". Has rejectReason, ...
    """
    def __init__(
        self,
        s,
    ):

        self.rejected = False
        self.rejectReason = None
        self.parseInput(s)

    #----------------------

    def parseInput(self, s):
        fields = s.split(FIELDSEP)

        if len(fields) == 6:  # have known class name as 1st field
            self.knownClassName = fields[0]
            fields = fields[1:]
        else:
            self.knownClassName = None

        self.ID = str(fields[0])
        self.isDiscard = str(fields[1])
        self.status = fields[2]
        self.journal = fields[3]
        self.doc = self.constructDoc(fields[4])

    #----------------------

    def constructDoc(self, text):
        # Do what needs to be done to construct the text portion
        return text

    #----------------------

    def getSampleAsText(self):
        # return this record as a text string

        if self.rejected: return None

        if self.knownClassName:
            fields = [self.knownClassName]
        else:
            fields = []
        fields += [
            self.ID,
            self.isDiscard,
            self.status,
            self.journal,
            self.doc,
        ]
        return FIELDSEP.join(fields) + RECORDSEP

    #----------------------

    def getSampleName(self):
        return self.ID

    def getDiscard(self):
        return self.isDiscard

    def getStatus(self):
        return self.status

    def getJournal(self):
        return self.journal

    def getDocument(self):
        return self.doc

    def getKnownClassName(self):
        return self.knownClassName

    def isReject(self):
        return self.rejected

    def getRejectReason(self):
        return self.rejectReason

    #----------------------
    # "Preprocessor" functions.
    #  Each preprocessor should modify this sample and return itself
    #----------------------
    refRemover = RefSectionRemover(maxFraction=0.4)  # finds ref sections

    def removeRefSection(self):
        self.doc = SampleRecord.refRemover.getBody(self.doc)
        return self

    # ---------------------------

    miceRegex = re.compile(r'\bmice\b', flags=re.IGNORECASE)

    def rejectIfNoMice(self):
        if not SampleRecord.miceRegex.search(self.doc):
            self.rejected = True
            self.rejectReason = "Mice not found"
        return self

    # ---------------------------

    urls_re = re.compile(r'\bhttps?://\S*', re.IGNORECASE)  # match URLs
    token_re = re.compile(r'\b([a-z_]\w+)\b')  # match lower case words
    stemmer = nltk.EnglishStemmer()

    def removeURLsCleanStem(self):
        '''
        Remove URLs and punct, lower case everything,
        Convert '-/-' to 'mut_mut',
        Keep tokens that start w/ letter or _ and are 2 or more chars.
        Stem,
        Replace \n with spaces
        '''
        output = ''

        for s in SampleRecord.urls_re.split(self.doc):  # split and remove URLs
            s = s.replace('-/-', ' mut_mut ').lower()
            for m in SampleRecord.token_re.finditer(s):
                output += " " + SampleRecord.stemmer.stem(m.group())
        self.doc = output
        return self

    # ---------------------------

    def removeURLs(self):
        '''
        Remove URLs, lower case everything,
        Convert '-/-' to 'mut_mut',
        '''
        output = ''

        for s in SampleRecord.urls_re.split(self.doc):
            s = s.replace('-/-', ' mut_mut ').lower()
            output += ' ' + s
        self.doc = output
        return self

    # ---------------------------

    def addJournalFeature(self):
        '''
        add the journal name as a text token to the document
        '''
        jtext = 'journal__' + '_'.join(self.journal.split(' ')).lower()
        self.doc += " " + jtext
        return self

    # ---------------------------

    def truncateText(self):
        # for debugging, so you can see a sample record easily
        self.doc = self.doc[:50]
        return self