Example #1
    @classmethod
    def setUpClass(cls):

        cls.proceedings = get_proceedings(1981, 2016)

        # shuffle and keep a random subset of 100 papers to keep the test fast
        np.random.shuffle(cls.proceedings)
        cls.proceedings = cls.proceedings[:100]

        # check that the whole cleaning pipeline runs without errors
        clean_papers(cls.proceedings)
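The setUpClass fragment above only makes sense inside a unittest.TestCase; a minimal surrounding skeleton (the class name and the trivial test are assumptions, not part of the source) could look like this:

import unittest

import numpy as np


class TestCleanPapers(unittest.TestCase):
    # hypothetical wrapper class; setUpClass is the fragment shown above,
    # get_proceedings and clean_papers come from the project itself

    def test_clean_papers_returns_one_text_per_paper(self):
        # assumed check: one cleaned text comes back per input paper
        self.assertEqual(len(clean_papers(self.proceedings)),
                         len(self.proceedings))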
Example #2
def go_hard(papers=None):
    if papers is None:
        papers = get_proceedings()
    res = {}
    for paper in papers:
        # parse every sentence of the paper's cleaned text into grammar structures
        res[paper] = []
        sentences = nltk.sent_tokenize(paper.clean_text)
        for sentence in sentences:
            grammar_sentences = parse_sentence(sentence)
            res[paper].extend(grammar_sentences)

    return res
Example #3
def go_hard():
    papers = get_proceedings()
    res = {}
    for paper in papers:
        res[paper] = []
        sentences = nltk.sent_tokenize(paper.clean_text)
        for sentence in sentences:
            grammar_sentences = parse_sentence(sentence)

            res[paper].extend(grammar_sentences)

    # print the parsed sentences for every paper
    for paper in res:
        print(70 * "*")
        print(paper.title, paper.year, paper.DOI)
        grammar = res[paper]
        for grammar_sentence in grammar:
            print(grammar_sentence)
Example #4
def clean_papers(proceedings):
    # note: `ses` and `regs` used below are assumed to be module-level globals
    cleaned = []

    for paper in proceedings:
        cleaner = CleanText(paper)
        cleaner.text = cleaner.remove_symbols()
        cleaner.text = cleaner.remove_references()
        cleaner.text = cleaner.remove_until_abstract()
        cleaner.text = cleaner.remove_l_listings()
        cleaner.text = cleaner.stitch_newlines()
        cleaner.text = cleaner.remove_sessions(ses=ses, regs=regs)
        cleaner.text = cleaner.remove_meta()
        cleaner.text = cleaner.remove_misc()
        cleaner.text = cleaner.remove_header()
        cleaner.text = cleaner.remove_in_text_references()
        cleaner.text = cleaner.stitch_newlines()
        cleaner.text = cleaner.fix_hyphen()
        cleaner.text = cleaner.fix_ofthe()
        cleaner.text = cleaner.remove_month_year_lines()
        cleaner.text = cleaner.remove_math_and_punctuation_lines()

        cleaned.append(cleaner.text)

    return cleaned


if __name__ == "__main__":

    proceedings = get_proceedings()
    res = clean_papers(proceedings)
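Example #4 reassigns cleaner.text after every call, which implies that each CleanText method returns the transformed text rather than mutating it in place. A minimal sketch of that assumed interface (the real class, with its full set of regexes, lives elsewhere in the project) might look like:

import re


class CleanText(object):
    # assumed shape of the cleaner used above: every method returns new text
    def __init__(self, paper):
        self.text = paper.text

    def remove_symbols(self):
        # drop characters outside printable ASCII
        return re.sub(r"[^\x20-\x7E\n]", "", self.text)

    def stitch_newlines(self):
        # join single line breaks inside paragraphs, keep blank lines intact
        return re.sub(r"(?<!\n)\n(?!\n)", " ", self.text)

Assigning the result back to cleaner.text after each call lets the next step operate on the already-cleaned version.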
Example #5
# note: `get_proceedings` is assumed to be imported from the project's own modules
from collections import Counter

from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize

wordnet_lemmatizer = WordNetLemmatizer()


stopwords = set(
    stopwords.words('english') +
    ["eg","the","of","to","and","a","in","for","that","is","we","with","as","on","this","be","are","by","was","or","an","were","it","from","their","not","they","our","can","more","have","at","which","these","one","also","when","each","how","used","use","other","such","but","between","all","using","two","than","about","would","has","different","work","i","could","may","had","while","if","there","some","only","into","both","what","new","been","most","however","will","who","where","them","its","first","because","many","three","then","no","so","figure","paper","within","like","well","way","might","literature","see","open","even","approach","1","2","3","particular","ways","rather","make","often","found", "without", "provide", "thus", "significant", "ie", "u", "mean", "term", "difference", "whole", "another", "second", "first", "third", "others", "previous", "much", "specific", "across", "important", "take", "=", "4", "although", "several", "current", "better", "one", "range", "main", "page", "whether", "le", "able", "due", "include", "existing", "allow", "line", "rate", "allows", "given", "single", "addition", "benefit", "four", "instance", "5", "provides", "us", "developed", "6", "b", "providing", "&", "discussion", "therefore", "describe", "v", "/", "c", "al", "finally", "yet"]
)

def e(i):
    # normalise a sentence: drop non-ASCII characters, lowercase, strip punctuation
    return (str(i).encode('ascii', 'ignore').decode('ascii').lower()
            .replace(",", "").replace(".", "").replace(";", "").replace(":", "")
            .replace("(", "").replace(")", "").replace("'s", ""))

# head words of all WordNet noun synsets
all_nouns = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')}

if __name__ == "__main__":
    proceedings = get_proceedings(min_year=1980, max_year=2019)

    keywords = set(["embodied", "embody", "body", "bodies"])
    cnt = Counter()
    nouns = Counter()

    for paper in proceedings:

        sentences = sent_tokenize(paper.clean_text)

        for sentence in sentences:
            words = e(sentence).split()

            for word in words:
                if word in keywords:
                    r = [wordnet_lemmatizer.lemmatize(w) for w in words if w not in stopwords.union(keywords)]
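                    # the original snippet ends here; a possible continuation (an
                    # assumption, not from the source) would feed the Counters above:
                    cnt.update(r)
                    nouns.update(w for w in r if w in all_nouns)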
Example #6
import pickle
from collections import Counter

from nltk.stem import WordNetLemmatizer

# `words` and `get_proceedings` are assumed to come from the project's own modules
dictionary = Counter()
dictionary.update([i.rstrip()
                   for i in words(open('cleantext/etc/dict.txt').read())])
lemma = WordNetLemmatizer()

freq = {}
freq2 = {}

try:
    # try loading a cached version first;
    # make it and store it, if it does not exist
    freq = pickle.load(open("pkls/freq.pkl", "rb"))
    freq2 = pickle.load(open("pkls/freq2.pkl", "rb"))
except Exception:
    for paper in get_proceedings():
        mywords = words(paper.text)
        # freq2 counts the total number of occurrences of each word across all papers
        for word in mywords:
            if word in freq2:
                freq2[word] += 1
            else:
                freq2[word] = 1

        # freq counts the number of papers in which each word appears at least once
        words_unique = list(set(mywords))
        for word in words_unique:
            if word in freq:
                freq[word] += 1
            else:
                freq[word] = 1

    pickle.dump(freq, open("pkls/freq.pkl", 'wb'))
    pickle.dump(freq2, open("pkls/freq2.pkl", 'wb'))
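Once the pickles exist, a quick way to inspect the two tables (a sketch, not part of the original) is to list the most widespread words, showing the number of papers each occurs in next to its raw occurrence count:

for word, n_papers in sorted(freq.items(), key=lambda kv: kv[1], reverse=True)[:20]:
    print(word, n_papers, freq2.get(word, 0))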