import numpy as np

@classmethod
def setUpClass(cls):
    cls.proceedings = get_proceedings(1981, 2016)
    # random subset
    np.random.shuffle(cls.proceedings)
    cls.proceedings = cls.proceedings[0:100]
    # test that the combination runs without errors
    clean_papers(cls.proceedings)
import nltk

def go_hard(papers=None):
    if papers is None:
        papers = get_proceedings()
    res = {}
    for paper in papers:
        res[paper] = []
        sentences = nltk.sent_tokenize(paper.clean_text)
        for sentence in sentences:
            grammar_sentences = parse_sentence(sentence)
            res[paper].extend(grammar_sentences)
    return res
if __name__ == "__main__":
    res = go_hard()
    for paper in res:
        print(70 * "*")
        print(paper.title, paper.year, paper.DOI)
        for grammar_sentence in res[paper]:
            print(grammar_sentence)
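parse_sentence() is used above but not defined in these snippets; a minimal sketch, assuming it extracts noun-phrase chunks from a POS-tagged sentence with an NLTK RegexpParser (the grammar here is illustrative, not the project's real one):

import nltk

# illustrative chunk grammar: optional determiner, adjectives, one or more nouns
GRAMMAR = nltk.RegexpParser("NP: {<DT>?<JJ>*<NN.*>+}")

def parse_sentence(sentence):
    tokens = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokens)
    tree = GRAMMAR.parse(tagged)
    # return each NP chunk as a plain string
    return [" ".join(word for word, tag in subtree.leaves())
            for subtree in tree.subtrees(filter=lambda t: t.label() == "NP")]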
def clean_papers(proceedings):
    cleaned = []
    for paper in proceedings:
        cleaner = CleanText(paper)
        cleaner.text = cleaner.remove_symbols()
        cleaner.text = cleaner.remove_references()
        cleaner.text = cleaner.remove_until_abstract()
        cleaner.text = cleaner.remove_l_listings()
        cleaner.text = cleaner.stitch_newlines()
        # ses/regs are session-heading patterns defined elsewhere in the project
        cleaner.text = cleaner.remove_sessions(ses=ses, regs=regs)
        cleaner.text = cleaner.remove_meta()
        cleaner.text = cleaner.remove_misc()
        cleaner.text = cleaner.remove_header()
        cleaner.text = cleaner.remove_in_text_references()
        cleaner.text = cleaner.stitch_newlines()
        cleaner.text = cleaner.fix_hyphen()
        cleaner.text = cleaner.fix_ofthe()
        cleaner.text = cleaner.remove_month_year_lines()
        cleaner.text = cleaner.remove_math_and_punctuation_lines()
        cleaned.append(cleaner.text)
    return cleaned

if __name__ == "__main__":
    proceedings = get_proceedings()
    res = clean_papers(proceedings)
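CleanText itself is not shown in these snippets; a minimal sketch of the pattern the pipeline assumes, where each remove_*/fix_* method returns a transformed copy of self.text that the caller assigns back (the two methods and their regexes are illustrative):

import re

class CleanText(object):
    """Sketch (assumed): every cleaning method reads self.text and
    returns the transformed string; clean_papers() assigns it back."""

    def __init__(self, paper):
        self.text = paper.text

    def fix_hyphen(self):
        # re-join words hyphenated across line breaks: "interac-\ntion" -> "interaction"
        return re.sub(r"(\w)-\n(\w)", r"\1\2", self.text)

    def stitch_newlines(self):
        # merge single hard line breaks inside a paragraph into spaces
        return re.sub(r"(?<!\n)\n(?!\n)", " ", self.text)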
from collections import Counter

from nltk import sent_tokenize
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

# NLTK's English stopwords plus corpus-specific filler terms
stopwords = set(
    stopwords.words('english') +
    ["eg", "the", "of", "to", "and", "a", "in", "for", "that", "is", "we", "with",
     "as", "on", "this", "be", "are", "by", "was", "or", "an", "were", "it", "from",
     "their", "not", "they", "our", "can", "more", "have", "at", "which", "these",
     "one", "also", "when", "each", "how", "used", "use", "other", "such", "but",
     "between", "all", "using", "two", "than", "about", "would", "has", "different",
     "work", "i", "could", "may", "had", "while", "if", "there", "some", "only",
     "into", "both", "what", "new", "been", "most", "however", "will", "who",
     "where", "them", "its", "first", "because", "many", "three", "then", "no",
     "so", "figure", "paper", "within", "like", "well", "way", "might",
     "literature", "see", "open", "even", "approach", "1", "2", "3", "particular",
     "ways", "rather", "make", "often", "found", "without", "provide", "thus",
     "significant", "ie", "u", "mean", "term", "difference", "whole", "another",
     "second", "third", "others", "previous", "much", "specific", "across",
     "important", "take", "=", "4", "although", "several", "current", "better",
     "range", "main", "page", "whether", "le", "able", "due", "include",
     "existing", "allow", "line", "rate", "allows", "given", "single", "addition",
     "benefit", "four", "instance", "5", "provides", "us", "developed", "6", "b",
     "providing", "&", "discussion", "therefore", "describe", "v", "/", "c", "al",
     "finally", "yet"]
)

def e(i):
    # ASCII-fold, lowercase, and strip punctuation and possessives
    return (str(i).encode('ascii', 'ignore').decode('ascii').lower()
            .replace(",", "").replace(".", "").replace(";", "").replace(":", "")
            .replace("(", "").replace(")", "").replace("'s", ""))

all_nouns = set(x.name().split('.', 1)[0] for x in wn.all_synsets('n'))

if __name__ == "__main__":
    proceedings = get_proceedings(min_year=1980, max_year=2019)
    keywords = set(["embodied", "embody", "body", "bodies"])
    skip = stopwords.union(keywords)
    cnt = Counter()
    nouns = Counter()
    for paper in proceedings:
        sentences = sent_tokenize(paper.clean_text)
        for sentence in sentences:
            words = e(sentence).split()
            for word in words:
                if word in keywords:
                    r = [wordnet_lemmatizer.lemmatize(w) for w in words if w not in skip]
                    # assumed bookkeeping: tally the co-occurring lemmas, and
                    # separately those that WordNet lists as nouns
                    cnt.update(r)
                    nouns.update(w for w in r if w in all_nouns)
                    break  # count each matching sentence once (assumed)
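For inspection, a short follow-up assuming the cnt and nouns counters from the loop above; the reporting itself is not part of the original snippet:

# assumed: print the terms and WordNet nouns that most often co-occur
# with the body-related keywords
print(70 * "*")
for term, n in cnt.most_common(25):
    print(term, n)
print(70 * "*")
for noun, n in nouns.most_common(25):
    print(noun, n)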
import pickle
from collections import Counter

import nltk

dictionary = Counter()
dictionary.update(i.rstrip() for i in words(open('cleantext/etc/dict.txt').read()))
lemma = nltk.wordnet.WordNetLemmatizer()

try:
    # try loading a cached version first;
    # build and store it, if it does not exist
    freq = pickle.load(open("pkls/freq.pkl", 'rb'))
    freq2 = pickle.load(open("pkls/freq2.pkl", 'rb'))
except Exception:
    freq = Counter()   # document frequency: number of papers containing the word
    freq2 = Counter()  # collection frequency: total occurrences across all papers
    for paper in get_proceedings():
        mywords = words(paper.text)
        freq2.update(mywords)
        freq.update(set(mywords))
    # cache both counters for subsequent runs
    pickle.dump(freq, open("pkls/freq.pkl", 'wb'))
    pickle.dump(freq2, open("pkls/freq2.pkl", 'wb'))
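words() is referenced above but not defined in these snippets; a minimal sketch, assuming it is the usual lowercase alphabetic tokenizer (the project's real helper may differ):

import re

def words(text):
    # assumed: split text into lowercase alphabetic tokens
    return re.findall(r"[a-z]+", text.lower())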