def process(file_path):
    """Count token frequencies around each citation in *file_path*.

    Each line of the file is parsed into an ExRef; for every reference we
    tokenize the sentence before the citing sentence (-1), the citing
    sentence itself (0), and the sentence after it (+1), and accumulate
    token counts separately for each position.

    Returns:
        (before_count, cur_count, after_count) — three collections.Counter
        objects mapping token -> frequency.
    """
    before_count = Counter()
    cur_count = Counter()
    after_count = Counter()
    with open(file_path, 'r') as content:
        # Iterate the file object directly: readlines() would load the
        # whole file into memory before the generators ever see a line.
        line_gen = (line.strip() for line in content)
        refgen = (ExRef(line) for line in line_gen)
        for idx, ref in enumerate(refgen):
            sent_before = ref.get_sentence(-1)
            sent_cur = ref.get_sentence(0)
            sent_after = ref.get_sentence(1)
            if idx % 100 == 0:
                print(idx)  # progress indicator
            # Counter.update() is equivalent to += Counter(tokens) here
            # (token counts are always positive) without allocating a
            # temporary Counter on every iteration.
            before_count.update(smart_tokenize(sent_before))
            cur_count.update(smart_tokenize(sent_cur))
            after_count.update(smart_tokenize(sent_after))
    return (before_count, cur_count, after_count)
def find_cits_with_sentiment_words(file_path):
    """Find citation references whose citing sentence contains sentiment words.

    Scans every reference produced by ref_gen(file_path) and intersects its
    tokenized sentence with the module-level `sentiment_words` set.

    Returns:
        A set of (ref, matched_words) pairs, where matched_words is a
        sorted tuple of the sentiment words found in that sentence.
    """
    ret = set()
    for idx, ref in enumerate(ref_gen(file_path)):
        if idx % 100 == 0:
            print(idx)  # progress indicator
        tokens = set(smart_tokenize(ref.get_sentence()))
        intersect = tokens.intersection(sentiment_words)
        # Original used the Python-2-only `<>` operator; a plain truthiness
        # test is the idiomatic (and Python-3-valid) check.
        if intersect:
            # BUG FIX: set.add() takes exactly one element, and a tuple
            # containing a list is unhashable. Store a single hashable
            # (ref, sorted-tuple) pair instead of `ret.add(ref, list(...))`,
            # which raised TypeError.
            ret.add((ref, tuple(sorted(intersect))))
    return ret
def count_words_in_context(file_path, popular_pmids):
    """Accumulate token counts of citing sentences, keyed by cited PMID.

    Only references whose cited_id appears in *popular_pmids* are counted;
    everything else is skipped.

    Returns:
        defaultdict(Counter) mapping cited_id -> Counter of tokens drawn
        from the citing sentences of that id.
    """
    word_count = defaultdict(Counter)
    for idx, ref in enumerate(ref_gen(file_path)):
        if idx % 100 == 0:
            print(idx)  # progress indicator
        pmid = ref.cited_id
        if pmid in popular_pmids:
            sentence = ref.get_sentence(0)
            # update() on the per-id Counter is equivalent to adding a
            # fresh Counter built from the same tokens.
            word_count[pmid].update(smart_tokenize(sentence))
    return word_count