def get_nps_from_papers(ids):
    """Load the per-paper noun-phrase tables for the given paper ids.

    For every id, the file ``NPs/<id>.xml`` is read as UTF-8 text. Despite
    the ``.xml`` extension, it is parsed as plain tab-separated lines: each
    non-blank line must split into exactly two fields, which become one
    key/value entry of that paper's dict. (Presumably the fields are a noun
    phrase and its count -- confirm against the writer of these files.)
    Values are kept as the strings read from the file; nothing is converted
    to int.

    If the file is missing, ``extract_nps.extract_nps_single_paper(iid)`` is
    called first; it is assumed to create the file (not verifiable from this
    function alone).

    Args:
        ids: iterable of paper identifiers. Non-string ids are accepted; they
            are converted with ``str()`` to build the file name.

    Returns:
        A list with one dict per id, in the same order as ``ids``.

    Raises:
        ValueError: if a non-blank line does not split into exactly two
            tab-separated fields.
        IOError/OSError: if the file still cannot be opened after extraction.
    """
    npps = []
    for iid in ids:
        # Build the path once, via str(), so the existence check and the
        # open always refer to the same file and both accept non-string ids.
        path = os.path.join("NPs", str(iid) + ".xml")
        if not os.path.exists(path):
            extract_nps.extract_nps_single_paper(iid)
        with codecs.open(path, mode='r', encoding='utf-8') as npfile:
            stripped = (line.strip() for line in npfile)
            # Skip blank lines (e.g. a trailing empty line); they would
            # otherwise produce a one-element row and make dict() raise.
            nps = dict(entry.split("\t") for entry in stripped if entry)
        npps.append(nps)
    return npps
for iid in ids: year = f.get_year_for_id(iid) if year > YEAR: after.append(iid) else: before.append(iid) print len(before) print len(after) # Find all NPs in the after and before sets after_nps = set() for iid in after: print iid if not os.path.exists("NPs/"+iid+".xml"): extract_nps.extract_nps_single_paper(iid) npfile = open("NPs/"+iid+".xml", 'r') for line in npfile: np, count = line.strip().split("\t") after_nps.add(np) before_nps = set() for iid in before: print iid if not os.path.exists("NPs/"+iid+".xml"): extract_nps.extract_nps_single_paper(iid) npfile = open("NPs/"+iid+".xml", 'r') for line in npfile: