def populateLocalCache(self):
    """Ensure every document in self.datadict has a cached text extract.

    Entries are visited in sorted key order. For each entry the document at
    d["url"] is downloaded into settings.DATADIR unless either the document
    or its ".txt" extract is already present. A missing extract is then
    produced by running the external ``pdftotext`` tool with ``-f 1 -l 5``
    and stored as UTF-8 text. When settings.DELETE_PDF_AFTER_EXTRACTION is
    truthy the source document is removed after its text has been written.
    """
    logger.info("Populating local document cache, retrieving from net as needed")
    for (_, d) in sorted(self.datadict.iteritems()):
        basename = utils.get_base_name(d["url"])
        fullpath = os.path.join(settings.DATADIR, basename)
        fulltxtpath = os.path.join(settings.DATADIR, basename.split(".")[0] + ".txt")

        if not os.path.exists(fulltxtpath) and not os.path.exists(fullpath):
            logger.info("Retrieving %s into %s" % (d["url"], settings.DATADIR))
            # Download before creating the file: a failed fetch must not
            # leave an empty stub that later runs would treat as cached.
            response = urllib.urlopen(d["url"])
            try:
                payload = response.read()
            finally:
                response.close()
            with open(fullpath, "wb") as f:
                f.write(payload)

        if not os.path.exists(fulltxtpath):
            # Pass argv as a list so paths containing spaces stay intact.
            cmd = ["pdftotext", "-f", "1", "-l", "5", fullpath, "-"]
            logger.info("converting %s to text" % fullpath)
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
            contents = p.communicate()[0]
            if p.returncode != 0:
                # Best effort: keep caching whatever was produced, but make
                # the failure visible instead of silently ignoring it.
                logger.warning("pdftotext exited with status %s for %s"
                               % (p.returncode, fullpath))
            with codecs.open(fulltxtpath, "wb", encoding="utf-8") as f:
                f.write(contents.decode("utf-8"))
            if settings.DELETE_PDF_AFTER_EXTRACTION:
                os.unlink(fullpath)
def findNotMatched(datadict, matches):
    """Return the datadict entries whose 'url' does not occur in matches.

    Orphan documents are kept separately for forensics. Each returned entry
    is the same object held by datadict and is updated in place with a
    'docid' key computed by utils.get_base_name from its 'url'.
    """
    known_urls = {entry['url'] for entry in datadict.values()}
    matched_urls = {match['url'] for match in matches}
    orphan_urls = known_urls - matched_urls
    orphans = [entry for entry in datadict.values()
               if entry['url'] in orphan_urls]
    for entry in orphans:
        entry.update({'docid': utils.get_base_name(entry['url'])})
    return orphans
def populateLocalCache(self):
    """Ensure every document in self.datadict has a cached text extract.

    Entries are visited in sorted key order. For each entry the document at
    d['url'] is downloaded into settings.DATADIR unless either the document
    or its ".txt" extract is already present. A missing extract is then
    produced by running the external ``pdftotext`` tool with ``-f 1 -l 5``
    and stored as UTF-8 text. When settings.DELETE_PDF_AFTER_EXTRACTION is
    truthy the source document is removed after its text has been written.
    """
    logger.info("Populating local document cache, retrieving from net as needed")
    for (_, d) in sorted(self.datadict.iteritems()):
        basename = utils.get_base_name(d['url'])
        fullpath = os.path.join(settings.DATADIR, basename)
        fulltxtpath = os.path.join(settings.DATADIR, basename.split('.')[0] + ".txt")

        if not os.path.exists(fulltxtpath) and not os.path.exists(fullpath):
            logger.info("Retrieving %s into %s" % (d['url'], settings.DATADIR))
            # Download before creating the file: a failed fetch must not
            # leave an empty stub that later runs would treat as cached.
            response = urllib.urlopen(d['url'])
            try:
                payload = response.read()
            finally:
                response.close()
            with open(fullpath, "wb") as f:
                f.write(payload)

        if not os.path.exists(fulltxtpath):
            # Pass argv as a list so paths containing spaces stay intact.
            cmd = ['pdftotext', '-f', '1', '-l', '5', fullpath, '-']
            logger.info("converting %s to text" % fullpath)
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
            contents = p.communicate()[0]
            if p.returncode != 0:
                # Best effort: keep caching whatever was produced, but make
                # the failure visible instead of silently ignoring it.
                logger.warning("pdftotext exited with status %s for %s"
                               % (p.returncode, fullpath))
            with codecs.open(fulltxtpath, "wb", encoding='utf-8') as f:
                f.write(contents.decode('utf-8'))
            if settings.DELETE_PDF_AFTER_EXTRACTION:
                os.unlink(fullpath)
def score(score_threshold, d):
    """Fuzzy match every record in identities against d['candidates'].

    Returns one list per candidate heading, holding a result dict for each
    identity whose score is strictly greater than score_threshold. Headings
    shorter than 6 characters always score 0; other headings are scored
    with fuzz.partial_ratio(entityName, heading).
    """
    results = []
    for heading in d['candidates']:
        too_short = len(heading) < 6
        accepted = []
        for (entity_name, entity_id) in identities:
            record = {
                'docid': utils.get_base_name(d['url']),
                'url': d['url'],
                'title': d['title'],
                'date': d['date'],
                'score': 0 if too_short else fuzz.partial_ratio(entity_name, heading),
                'entityName': entity_name,
                'id': entity_id,
                'heading': heading,
            }
            if record['score'] > score_threshold:
                accepted.append(record)
        results.append(accepted)
    return results
def remove_duplicate_domains(self):
    """
    Drop domains whose GO label was already seen.

    Of rows sharing the same GO_terms value, e.g.
        IPR001433 GO:0016491
        IPR001709 GO:0016491
        IPR001834 GO:0016491
    only the first one is kept. The result is also written next to the
    input as "<base name>_unique.tab" (tab separated, no index column).

    Parameters
    ----------

    Returns
    -------
    pandas.DataFrame
        dataframe with domains with unique GO label
    """
    print("Filtering out duplicate domains.")
    source_path = join(self.data_path, self.interpro2go_tab)
    all_domains = read_csv(source_path, sep="\t", header=0)
    total_count = all_domains.shape[0]

    output_name = get_base_name(self.interpro2go_tab) + "_unique.tab"
    deduplicated = all_domains.drop_duplicates(["GO_terms"])
    deduplicated.to_csv(join(self.data_path, output_name),
                        sep="\t",
                        index=False)

    unique_count = deduplicated.shape[0]
    print("Reducing domains from {} to {}.".format(total_count, unique_count))
    return deduplicated
def create_random_set_multi(src_lang, trg_lang, work_dir, set_num):
    """Create random sets for every sample under <work_dir>/broad-samples.

    Each sub-directory of "broad-samples" is treated as one sample size.
    The distinct file stems in it (file name up to the first '.') are
    paired as "<stem>.<src_lang>" / "<stem>.<trg_lang>" and handed to
    create_random_set_single, with output going to
    <work_dir>/random-sets/<sample size>.
    """
    samples_root = os.path.join(work_dir, "broad-samples")
    output_root = os.path.join(work_dir, "random-sets")

    for sample_dir in utils.sub_dir_path(samples_root):
        size_label = utils.get_base_name(sample_dir)
        size_out_dir = os.path.join(output_root, size_label)

        # Collapse "<stem>.<lang>" file names to their unique stems.
        stems = Set()
        for file_name in utils.get_immediate_subfiles(sample_dir):
            stems.add(file_name.split('.')[0])

        for stem in stems:
            src_path = os.path.join(samples_root, size_label,
                                    ''.join([stem, '.', src_lang]))
            trg_path = os.path.join(samples_root, size_label,
                                    ''.join([stem, '.', trg_lang]))
            create_random_set_single(src_lang, trg_lang, src_path, trg_path,
                                     set_num, size_out_dir)
def save_rand_comb(self, num_comb, uniq_dom2go):
    """
    Write num_comb random pairs of domains from uniq_dom2go to a csv file.

    The pairs come from choose_combos(number of rows, 2, num_comb); each
    one is used as a pair of positional row indices. The file is named
    "<base name of interpro2go_tab>_rand_comb.csv" inside self.data_path.

    Parameters
    ----------
    num_comb : int
        number of combination to pick
    uniq_dom2go : pandas.DataFrame
        dataframe of domains with unique GO terms

    Returns
    -------
    None
    """
    row_count = uniq_dom2go.shape[0]
    print("Pick {} random combinations of the {} domains and save them.".
          format(num_comb, row_count))
    index_pairs = choose_combos(row_count, 2, num_comb)

    # save dataframe with the combinations
    output_name = get_base_name(self.interpro2go_tab) + "_rand_comb.csv"
    with open(join(self.data_path, output_name), 'w') as output_file:
        writer = csv.writer(output_file, delimiter=',')
        writer.writerow(
            ["interpro_id1", "interpro_id2", "gos_id1", "gos_id2"])
        for index_pair in index_pairs:
            first_row = uniq_dom2go.iloc[index_pair[0]]
            second_row = uniq_dom2go.iloc[index_pair[1]]
            writer.writerow([
                str(first_row.interpro_ids),
                str(second_row.interpro_ids),
                str(first_row.GO_terms),
                str(second_row.GO_terms),
            ])
def get_scene_file_name(self):
    """Return the base name of self.file_path up to its first '.'.

    This name will be used to create folder name to where we save the
    outfile.
    """
    base_name = u.get_base_name(self.file_path)
    stem = base_name.partition('.')[0]
    return stem
def bleu_intervals(in_dir, level, single_bleu_dir):
    """Build a text report of BLEU confidence intervals per sample directory.

    For every sub-directory of in_dir, the scores of all "*.bleu" files are
    read with read_decoding_score and passed to confidence_intervals
    together with level. The report lines list the directory, the min/max
    input score, the level, the min/max of the interval and the "true"
    score read from <single_bleu_dir>/sample-<dir base name>.bleu.

    Returns the report as a list of strings.
    """
    report = []
    for sample_dir in utils.sub_dir_path(in_dir):
        report.append(sample_dir)

        bleu_values = [
            float(read_decoding_score(os.path.join(sample_dir, entry)))
            for entry in utils.get_immediate_subfiles(sample_dir)
            if entry.endswith(".bleu")
        ]
        scores = np.array(bleu_values)
        interval = confidence_intervals(scores, level)

        report.append("input scores")
        report.append("min=%f max=%f" % (min(scores), max(scores)))
        report.append("level: %d" % (level))
        report.append("bleu intervals:")
        report.append("interval: %f %f" % (min(interval), max(interval)))

        # single bleu score
        reference_name = ''.join(
            ['sample-', utils.get_base_name(sample_dir), '.bleu'])
        reference_path = os.path.join(single_bleu_dir, reference_name)
        true_bleu = read_decoding_score(reference_path)
        report.append("true bleu: %s\n" % (true_bleu))
    return report
def findNotMatched(datadict, matches):
    """Collect the documents in datadict that have no entry in matches.

    Data of orphan documents is saved separately, for forensics. Documents
    are compared by their 'url'. The returned entries are the very objects
    stored in datadict; each gets a 'docid' key set in place from
    utils.get_base_name(url).
    """
    all_urls = {doc['url'] for doc in datadict.values()}
    urls_with_match = {hit['url'] for hit in matches}
    unmatched_urls = all_urls.difference(urls_with_match)

    unmatched_docs = []
    for doc in datadict.values():
        if doc['url'] in unmatched_urls:
            unmatched_docs.append(doc)

    for doc in unmatched_docs:
        doc.update({'docid': utils.get_base_name(doc['url'])})
    return unmatched_docs
def getDocumentLines(self, k):
    """Return the sanitized lines of the cached text for document k.

    The cached ".txt" file is looked up in settings.DATADIR under the
    document's base name (up to its first "."), read as UTF-8, re-encoded
    to UTF-8 bytes, split on newlines and passed through
    self.sanitize_lines.
    """
    entry = self.datadict[k]
    doc_name = utils.get_base_name(entry["url"])
    doc_path = os.path.join(settings.DATADIR, doc_name)
    txt_name = doc_name.split(".")[0] + ".txt"
    txt_path = os.path.join(settings.DATADIR, txt_name)
    logger.debug("Loading cached text for %s from %s" % (doc_path, txt_path))
    with codecs.open(txt_path, encoding="utf-8") as handle:
        text = handle.read()
    encoded = text.encode("utf-8")
    return self.sanitize_lines(encoded.split("\n"))
def getDocumentLines(self, k):
    """Load the cached text of document k and return its sanitized lines.

    The text file lives in settings.DATADIR and is named after the
    document's base name up to its first '.', with a ".txt" suffix. Its
    content is decoded from UTF-8, encoded back to UTF-8 bytes, split on
    newlines and handed to self.sanitize_lines.
    """
    url = self.datadict[k]['url']
    cached_name = utils.get_base_name(url)
    cached_doc = os.path.join(settings.DATADIR, cached_name)
    cached_txt = os.path.join(settings.DATADIR,
                              cached_name.split('.')[0] + ".txt")
    logger.debug("Loading cached text for %s from %s" % (cached_doc, cached_txt))
    with codecs.open(cached_txt, encoding='utf-8') as source:
        payload = source.read().encode('utf-8')
    raw_lines = payload.split("\n")
    return self.sanitize_lines(raw_lines)
def score(score_threshold, d):
    """Fuzzy match all records in identities with the lines in d['candidates'].

    For each candidate heading a list is appended to the result; it holds
    one dict per identity whose score exceeds score_threshold. A heading of
    fewer than 6 characters scores 0, otherwise the score is
    fuzz.partial_ratio(entityName, heading).
    """

    def build_record(entity_name, entity_id, heading):
        # One scored pairing of an identity with a candidate heading.
        if len(heading) < 6:
            ratio = 0
        else:
            ratio = fuzz.partial_ratio(entity_name, heading)
        return {'docid': utils.get_base_name(d['url']),
                'url': d['url'],
                'title': d['title'],
                'date': d['date'],
                'score': ratio,
                'entityName': entity_name,
                'id': entity_id,
                'heading': heading}

    results = []
    for heading in d['candidates']:
        records = [build_record(entity_name, entity_id, heading)
                   for (entity_name, entity_id) in identities]
        results.append([r for r in records if r['score'] > score_threshold])
    return results
def get_go_labels(self):
    """
    Compute GO labels for every row of the domains2GO table and save them.

    The tab separated file self.interpro2go_tab is read from
    self.data_path, self.extract_go_labels is applied to each row, and the
    result is written as "<base name>_labels.csv" (comma separated, no
    index column) in the same directory.

    Parameters
    ----------

    Returns
    -------
    None
    """
    print("Get labels for GOs.")
    table_path = join(self.data_path, self.interpro2go_tab)
    domains = read_csv(table_path, sep="\t", header=0)
    labelled = domains.apply(self.extract_go_labels, axis=1)
    output_name = get_base_name(self.interpro2go_tab) + "_labels.csv"
    output_path = join(self.data_path, output_name)
    labelled.to_csv(output_path, sep=",", index=False)
def update_clients_from_server(self,
                               sess,
                               clients,
                               update_vars_type=utils.VARS_TYPE_ALL):
    """Updates clients vars from server vars.

    The current server values are read once and fed, wrapped in a
    one-element array, to the update placeholders of every client; all
    client update ops are then run in a single sess.run call.

    Args:
      sess: TF Session.
      clients: A list of clients that will be updated from server.
      update_vars_type: String. Options: utils.VARS_TYPE_ALL means all vars,
        utils.VARS_TYPE_SHARED means shared vars.

    Raises:
      ValueError: Unknown update_vars_type.
    """
    feed_dict = {}
    if update_vars_type == utils.VARS_TYPE_ALL:
        server_values = sess.run(self.server.read_ops_all_vars)
        update_ops = [client.update_ops_all for client in clients]
        for client in clients:
            for name, placeholder in client.dict_update_placeholders.items():
                feed_dict[placeholder] = np.array([server_values[name]])
    elif update_vars_type == utils.VARS_TYPE_SHARED:
        server_values = sess.run(self.server.read_ops_shared_vars)
        update_ops = [client.update_ops_shared for client in clients]
        for client in clients:
            for variable in client.model_train.shared_vars:
                name = utils.get_base_name(variable)
                placeholder = client.dict_update_placeholders[name]
                feed_dict[placeholder] = np.array([server_values[name]])
    else:
        raise ValueError('Unknown vars update type: %s' % update_vars_type)

    sess.run(update_ops, feed_dict=feed_dict)
def convert_to_tab(self, keep_only_MF):
    """
    Convert mapping of interpro to GOs into tabular file

    For each interpro domain in species file, read all GOs and arrange
    them as the column of the row. Lines starting with "!" are skipped.
    Consecutive lines with the same InterPro id form one group, which is
    written as "<id>\\t<space separated GO terms>" if the id is present in
    self.species_domains_dict. The output file name is also stored in
    self.interpro2go_tab.

    Parameters
    ----------
    keep_only_MF : bool
        keep only molecular function GO annotations (True),
        otherwise (False)

    Returns
    -------
    None
    """
    print("Converting to tabs.")
    self.read_species_domains()

    # The conditional must only pick the suffix. Unparenthesised it binds
    # looser than "+", which made the whole name ".tab" when
    # keep_only_MF was False.
    suffix = "_MF.tab" if keep_only_MF else ".tab"
    interpro2go_tab = get_base_name(
        self.interpro2go) + "_" + self.species_name + suffix
    self.interpro2go_tab = interpro2go_tab

    num_written_lines = 0
    with open(self.interpro2go, 'r') as interpro2go_file, open(
            join(self.data_path, interpro2go_tab),
            'w') as interpro2go_tab_file:
        interpro2go_tab_file.write("interpro_ids\tGO_terms\n")

        def write_group(domain_id, go_terms):
            # Write one finished group; return how many rows were written.
            if domain_id not in self.species_domains_dict:
                return 0
            interpro2go_tab_file.write(domain_id + '\t' +
                                       " ".join(go_terms) + "\n")
            return 1

        previous_id = " "  # sentinel: no group started yet
        previous_go_terms = []
        for interpro2go_line in interpro2go_file:
            if interpro2go_line[0] == "!":
                continue
            current_id = interpro2go_line.strip().split(
                "InterPro:")[1].split(" ")[0]
            assert current_id[:3] == "IPR", (
                "AssertionError: interpro id must start with IPR.\n"
                " line: {}".format(interpro2go_line))
            current_go_term = interpro2go_line.strip().split(" ; ")[-1]
            if keep_only_MF and (
                    current_go_term in self.go_db and
                    self.go_db[current_go_term].namespace !=
                    "molecular_function"):
                continue

            if current_id == previous_id:
                # still in the same interpro domain
                previous_go_terms.append(current_go_term)
                continue

            if previous_id != " ":
                # on another interpro domain: flush the finished group
                assert len(previous_go_terms) > 0, (
                    "AssertionError: each interpro should have at least "
                    "one GO.\n line:{}".format(interpro2go_line))
                num_written_lines += write_group(previous_id,
                                                 previous_go_terms)
            previous_id = current_id
            previous_go_terms = [current_go_term]

        # Groups are only flushed when the next id shows up, so the last
        # one has to be written explicitly once the input is exhausted.
        if previous_id != " ":
            num_written_lines += write_group(previous_id, previous_go_terms)

    print("Saved {} interpro2GO tabs in {}.".format(num_written_lines,
                                                    interpro2go_tab))
def update_server_from_clients(self,
                               sess,
                               clients,
                               update_vars_type=utils.VARS_TYPE_ALL):
    """Updates server vars to be the weighted average of client vars.

    Every client value is read, scaled by that client's weight
    (its share of the total number of batches, times the number of
    clients), and the per-variable list of scaled values is fed to the
    server's update placeholders.
    NOTE(review): the averaging itself presumably happens inside the
    server update ops, which are not visible here - confirm.

    Args:
      sess: TF Session.
      clients: A list of clients that will be used to update server.
      update_vars_type: String. Options: utils.VARS_TYPE_ALL means all vars,
        utils.VARS_TYPE_SHARED means shared vars.

    Raises:
      ValueError: Unknown update_vars_type.
    """
    num_clients = len(clients)
    total_num_batches = sum(c.model_train.data.num_batches for c in clients)

    # client_weights should sum to num_clients.
    # Convert to float BEFORE dividing: float(a * b / c) truncates first
    # when "/" is integer division (Python 2 without the division import).
    client_weights = [
        float(c.model_train.data.num_batches * num_clients) /
        total_num_batches for c in clients
    ]

    if update_vars_type == utils.VARS_TYPE_ALL:
        read_client_ops = collections.defaultdict(list)
        for var_base_name in self.server.model_train.var_dict:
            for c in clients:
                read_client_ops[var_base_name].append(
                    c.read_ops_all_vars[var_base_name])
        client_vars = sess.run(read_client_ops)

        for cid, weight in enumerate(client_weights):
            for var_base_name in self.server.model_train.var_dict:
                client_vars[var_base_name][cid] *= weight

        server_feed_dict = {}
        for (var_base_name,
             placeholder) in self.server.dict_update_placeholders.items():
            server_feed_dict[placeholder] = np.array(
                client_vars[var_base_name])
        sess.run(self.server.update_ops_all, feed_dict=server_feed_dict)

    elif update_vars_type == utils.VARS_TYPE_SHARED:
        shared_var_names = [
            utils.get_base_name(v)
            for v in self.server.model_train.shared_vars
        ]

        read_client_ops = collections.defaultdict(list)
        for var_base_name in shared_var_names:
            for c in clients:
                read_client_ops[var_base_name].append(
                    c.read_ops_shared_vars[var_base_name])
        client_vars = sess.run(read_client_ops)

        for cid, weight in enumerate(client_weights):
            for var_base_name in shared_var_names:
                client_vars[var_base_name][cid] *= weight

        server_feed_dict = {}
        for var_base_name in shared_var_names:
            placeholder = self.server.dict_update_placeholders[var_base_name]
            server_feed_dict[placeholder] = np.array(
                client_vars[var_base_name])
        sess.run(self.server.update_ops_shared, feed_dict=server_feed_dict)

    else:
        raise ValueError('Unknown vars update type: %s' % update_vars_type)
def main():
    """Run the whole pipeline: scrape, cache, match, report and rank.

    Steps, in order: scrape and save the link metadata, filter it through
    LinksProcessor, collect candidate lines (MAGIC_RE) and date lines
    (DATE_RE) for every document, compute matches with makeMatches, dump
    unmatched and matched documents as JSON / HTML reports, attach a
    committee session date to matched documents where one can be
    extracted, save the matches as JSON and TSV, then reload the JSON and
    build the rankings.
    """
    ##################################
    # get stuff
    ##################################
    retriever = MetaDataRetriever(linksOutputFile=settings.LINKSFILE)
    data = retriever.scrape().save().getData()

    # dedupe, date filter and get back the data
    datadict = LinksProcessor(data, filterDate=settings.START_DATE).getData()

    # retrieve docs from net, convert them to text and cache the result
    docs = DocumentCache(datadict)

    datedict = {}

    ##################################
    # find stuff
    ##################################
    logger.info("Finding candidate lines...")
    for k in sorted(datadict.keys()):
        lines = docs.getDocumentLines(k)
        pat = [x for x in utils.mergeLines(lines, 3) if re.search(MAGIC_RE, x)]
        datepat = [
            x for x in utils.mergeLines(lines, 3) if re.search(DATE_RE, x)
        ]
        datadict[k]['candidates'] = pat
        datedict[k] = {'candidates': datepat}

    matchdict = makeMatches(datadict)

    # as a kludge , committees are stored with id COMMITEE_ID_BASE+offset in identities.json
    # so we can separate the matches into types
    mksMatchesCnt = sum(1 for group in matchdict.values()
                        if int(group[0]['id']) < settings.COMMITEE_ID_BASE)
    commMatchesCnt = len(matchdict) - mksMatchesCnt
    logger.info("Located %d unique matches with score > %d (%d: mks, %d: committee) " %
                (len(matchdict), settings.SCORE_THRESHOLD, mksMatchesCnt, commMatchesCnt))

    ##################################
    # save stuff
    ##################################
    # Flatten the per-key match lists. Unlike reduce() without an initial
    # value this also works when nothing matched, and it is linear.
    matches = [match for group in matchdict.values() for match in group]

    not_matched = findNotMatched(datadict, matches)
    dump_json(not_matched, settings.NOMATCHESFILE)
    logger.info("saved details of documents with no matches as json in %s",
                settings.NOMATCHESFILE)

    dump_report(not_matched, settings.NO_MATCHES_HTML_FILE,
                settings.NO_MATCHES_TEMPLATE_FILE)
    dump_report(matches, settings.MATCHES_HTML_FILE,
                settings.MATCHES_TEMPLATE_FILE)

    cnt = 0
    logger.info("finding committee session dates")
    # NOTE(review): keyed by docid, so only the last match per document
    # survives from here on - confirm this is intended.
    matchesDict = {x['docid']: x for x in matches}
    for (k, v) in datedict.iteritems():
        for line in v['candidates']:
            line = utils.reverse_nums(line)  # text extraction reverses numbers, RTL thing
            # munge and contort to extract a valid date
            d = extract_date(utils.get_base_name(k), line)
            if d and matchesDict.get(d['docid']):
                cnt += 1
                matchesDict[d['docid']]['comm_session_date'] = d['date'].strftime("%d/%m/%Y")
    logger.info("updated %d documents with a committee session date" % cnt)

    # use the updated dict containing comm_session_date
    # for matches
    matches = matchesDict.values()

    # Log only after each file has actually been written.
    dump_json(matches, settings.MATCHESFILE)
    logger.info("saved matches as json in %s", settings.MATCHESFILE)

    # saves matches as csv file
    g = filter_keys(data_to_gen(settings.MATCHESFILE))
    write_tsv(g, settings.MATCHES_CSV_FILE)
    logger.info("saved matches as csv in %s", settings.MATCHES_CSV_FILE)

    # <-> short-circuit here to skip previous stages
    # load the matches back up
    with codecs.open(settings.MATCHESFILE, "r", encoding='utf-8') as f:
        matches = json.load(f)

    createRankings(matches)
    logger.info("saved rankings in %s", settings.COUNTS_CSVFILE)

    logger.info("Cheers.")
def __init__(self,
             name,
             data_generator,
             model_class,
             configs=None,
             id_=-1,
             initializer=None):
    """Builds the data pipeline, three models and the update/read ops.

    Args:
      name: Stored as self.name; also passed to model_class and used to
        derive the TF name scopes via the utils.get_*_name_scope helpers.
      data_generator: Callable invoked as
        data_generator(configs=configs, agent_id=id_); its result must
        expose train_data_batch, validation_data_batch and test_data_batch.
      model_class: Callable that builds a model; it is invoked three times
        (train, validation, test), the latter two with reuse=True.
      configs: Object providing train_config and eval_config. Despite the
        None default, these attributes are read unconditionally, so None
        would fail here.
      id_: Stored as self.id and forwarded to data_generator as agent_id.
      initializer: Forwarded unchanged to every model_class call.
    """
    self.name = name
    self.id = id_
    self.data = data_generator(configs=configs, agent_id=id_)

    # Training model, built inside the train name scope.
    with tf.name_scope(utils.get_train_name_scope(name)):
        train_data = self.data.train_data_batch
        model_train = model_class(name,
                                  is_training=True,
                                  data=train_data,
                                  config=configs.train_config,
                                  initializer=initializer)

    # Validation model: reuse=True, and it uses train_config (not
    # eval_config).
    with tf.name_scope(utils.get_validation_name_scope(name)):
        valid_data = self.data.validation_data_batch
        model_validation = model_class(name,
                                       is_training=False,
                                       data=valid_data,
                                       reuse=True,
                                       config=configs.train_config,
                                       initializer=initializer)

    # Test model: reuse=True, configured with eval_config.
    with tf.name_scope(utils.get_test_name_scope(name)):
        test_data = self.data.test_data_batch
        model_test = model_class(name,
                                 is_training=False,
                                 data=test_data,
                                 reuse=True,
                                 config=configs.eval_config,
                                 initializer=initializer)

    self.model_train = model_train
    self.model_validation = model_validation
    self.model_test = model_test

    with tf.name_scope(utils.get_update_name_scope(self.name)):
        # One could use any of the three models in this update name scope, since
        # the vars are shared among them.
        update_ops_shared, placeholders_shared = utils.generate_update_ops(
            self.model_train.shared_vars)
        update_ops_personal, placeholders_personal = utils.generate_update_ops(
            self.model_train.personal_vars)
        update_ops_all = update_ops_shared + update_ops_personal

        # Merges two dicts of placeholders. placeholders_shared and
        # placeholders_personal should have no overlap keys.
        assert not set(placeholders_shared.keys()).intersection(
            placeholders_personal.keys())
        dict_update_placeholders = {}
        dict_update_placeholders.update(placeholders_shared)
        dict_update_placeholders.update(placeholders_personal)

    self.update_ops_all = update_ops_all
    self.update_ops_shared = update_ops_shared
    self.dict_update_placeholders = dict_update_placeholders

    # NOTE(review): the original layout was lost; the read ops below are
    # assumed to be created outside the update name scope - confirm.
    # Read ops for all train-model vars, keyed like model_train.var_dict.
    self.read_ops_all_vars = {
        k: v.value() for k, v in self.model_train.var_dict.items()
    }
    # Read ops for the shared vars, keyed by utils.get_base_name(var).
    self.read_ops_shared_vars = {
        utils.get_base_name(v): v.value()
        for v in self.model_train.shared_vars
    }
def main():
    """Run the whole pipeline: scrape, cache, match, report and rank.

    Steps, in order: scrape and save the link metadata, filter it through
    LinksProcessor, collect candidate lines (MAGIC_RE) and date lines
    (DATE_RE) for every document, compute matches with makeMatches, dump
    unmatched and matched documents as JSON / HTML reports, attach a
    committee session date to matched documents where one can be
    extracted, save the matches as JSON and TSV, then reload the JSON and
    build the rankings.
    """
    ##################################
    # get stuff
    ##################################
    retriever = MetaDataRetriever(linksOutputFile=settings.LINKSFILE)
    data = retriever.scrape().save().getData()

    # dedupe, date filter and get back the data
    datadict = LinksProcessor(data, filterDate=settings.START_DATE).getData()

    # retrieve docs from net, convert them to text and cache the result
    docs = DocumentCache(datadict)

    datedict = {}

    ##################################
    # find stuff
    ##################################
    logger.info("Finding candidate lines...")
    for k in sorted(datadict.keys()):
        lines = docs.getDocumentLines(k)
        pat = [x for x in utils.mergeLines(lines,3) if re.search(MAGIC_RE, x)]
        datepat = [x for x in utils.mergeLines(lines,3) if re.search(DATE_RE, x)]
        datadict[k]['candidates'] = pat
        datedict[k] = {'candidates': datepat}

    matchdict = makeMatches(datadict)

    # as a kludge , committees are stored with id COMMITEE_ID_BASE+offset in identities.json
    # so we can separate the matches into types
    mksMatchesCnt = sum(1 for group in matchdict.values()
                        if int(group[0]['id']) < settings.COMMITEE_ID_BASE)
    commMatchesCnt = len(matchdict) - mksMatchesCnt
    logger.info("Located %d unique matches with score > %d (%d: mks, %d: committee) " %
                (len(matchdict), settings.SCORE_THRESHOLD, mksMatchesCnt, commMatchesCnt))

    ##################################
    # save stuff
    ##################################
    # Flatten the per-key match lists. Unlike reduce() without an initial
    # value this also works when nothing matched, and it is linear.
    matches = [match for group in matchdict.values() for match in group]

    not_matched = findNotMatched(datadict,matches)
    dump_json(not_matched,settings.NOMATCHESFILE)
    logger.info("saved details of documents with no matches as json in %s", settings.NOMATCHESFILE)

    dump_report(not_matched,settings.NO_MATCHES_HTML_FILE,settings.NO_MATCHES_TEMPLATE_FILE)
    dump_report(matches,settings.MATCHES_HTML_FILE,settings.MATCHES_TEMPLATE_FILE)

    cnt = 0
    logger.info("finding committee session dates")
    # NOTE(review): keyed by docid, so only the last match per document
    # survives from here on - confirm this is intended.
    matchesDict = {x['docid']: x for x in matches}
    for (k, v) in datedict.iteritems():
        for line in v['candidates']:
            line = utils.reverse_nums(line)  # text extraction reverses numbers, RTL thing
            # munge and contort to extract a valid date
            d = extract_date(utils.get_base_name(k), line)
            if d and matchesDict.get(d['docid']):
                cnt += 1
                matchesDict[d['docid']]['comm_session_date'] = d['date'].strftime("%d/%m/%Y")
    logger.info("updated %d documents with a committee session date" % cnt)

    # use the updated dict containing comm_session_date
    # for matches
    matches = matchesDict.values()

    # Log only after each file has actually been written.
    dump_json(matches,settings.MATCHESFILE)
    logger.info("saved matches as json in %s", settings.MATCHESFILE)

    # saves matches as csv file
    g = filter_keys(data_to_gen(settings.MATCHESFILE))
    write_tsv(g,settings.MATCHES_CSV_FILE)
    logger.info("saved matches as csv in %s", settings.MATCHES_CSV_FILE)

    # <-> short-circuit here to skip previous stages
    # load the matches back up
    with codecs.open(settings.MATCHESFILE, "r", encoding='utf-8') as f:
        matches = json.load(f)

    createRankings(matches)
    logger.info("saved rankings in %s", settings.COUNTS_CSVFILE)

    logger.info("Cheers.")