import fnmatch
import json
import os
import time

# Project-specific helpers (LoggingUtil, SparkUtil, SUtil, Facts, Guesses,
# WordEmbed, BinaryEncoder, is_training, to_csv_row) are assumed to be
# imported elsewhere in this module.

def word2vec (conf):
    """ Build Chemotext2 word embeddings from the articles in conf.input_dir. """
    logger = LoggingUtil.init_logging (__file__)
    logger.info ("Creating Chemotext2 word embeddings from input: {0}".format (conf.input_dir))
    sc = SparkUtil.get_spark_context (conf.spark_conf)

    # Distribute the article paths and load each article on the executors.
    article_paths = SUtil.get_article_paths (conf.input_dir) #[:20000]
    articles = sc.parallelize (article_paths, conf.spark_conf.parts). \
        map (lambda p : SUtil.get_article (p))
    logger.info ("Listed {0} input files".format (articles.count ()))

    # Normalize the output directory into a file:// URI rooted at <output_dir>/w2v.
    conf.output_dir = conf.output_dir.replace ("file:", "")
    conf.output_dir = "file://{0}/w2v".format (conf.output_dir)
    return WordEmbed (sc, conf.output_dir, articles)
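# The parallelize-and-map pattern above is plain PySpark: a driver-side list of
# paths is split across `parts` partitions and each path is loaded on the
# executors. A minimal standalone sketch of that pattern follows; load_article
# is a hypothetical stand-in for SUtil.get_article and the paths are illustrative.
#
#   from pyspark import SparkContext
#
#   def load_article (path):
#       with open (path) as f:
#           return f.read ()
#
#   sc = SparkContext (master="local[*]", appName="parallelize-sketch")
#   paths = ["/tmp/articles/a.xml", "/tmp/articles/b.xml"]
#   articles = sc.parallelize (paths, 4).map (load_article)
#   print (articles.count ())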
def evaluate (conf):
    """ Annotate Chemotext2 guesses against CTD facts and write train/test/CSV outputs. """
    logger = LoggingUtil.init_logging (__file__)
    logger.info ("Evaluating Chemotext2 output: {0}".format (conf.input_dir))
    sc = SparkUtil.get_spark_context (conf.spark_conf)
    facts = Facts.get_facts (sc, conf.ctd_conf)
    pathway_facts = Facts.get_pathway_facts (sc, conf.ctd_conf)
    logger.info ("Loaded {0} facts".format (facts.count ()))
    articles = SUtil.get_article_paths (conf.input_dir) #[:200]
    logger.info ("Listed {0} input files".format (len (articles)))

    # Process the corpus in slices, skipping any slice that has already been annotated.
    for slice_n in range (0, conf.slices):
        output_dir = os.path.join (conf.output_dir, "eval", "annotated", str (slice_n))
        if os.path.exists (output_dir):
            logger.info ("Skipping existing directory {0}".format (output_dir))
        else:
            logger.info ("Loading guesses")
            start = time.time ()
            guesses, article_pmids = Guesses.get_guesses (sc, conf.input_dir, conf.spark_conf.parts,
                                                          articles, conf.slices, slice_n)
            elapsed = round (time.time () - start, 2)
            count = guesses.count ()
            logger.info ("Guesses[slice {0}]. {1} binaries in {2} seconds.".format (slice_n, count, elapsed))
            pmids = sc.broadcast (article_pmids)

            # Load the pmid -> publication date map.
            start = time.time ()
            pmid_date_map = None
            pmid_map_path = os.path.join (
                os.path.dirname (conf.input_dir), "pmid", "pmid_date_2.json")
            # /projects/stars/var/chemotext/pmid/pmid_date_2.json
            print ("Loading pmid date map: {0}".format (pmid_map_path))
            with open (pmid_map_path, "r") as stream:
                pmid_date_map = json.loads (stream.read ())
            elapsed = round (time.time () - start, 2)
            print ("Read pmid date map in {0} seconds".format (elapsed))

            if pmid_date_map is None:
                print ("Unable to load pmid date map")
            else:
                # Annotate guesses with facts, pathway facts, and publication dates.
                start = time.time ()
                pmid_date_map_broadcast = sc.broadcast (pmid_date_map)
                annotated = Guesses.annotate (guesses, facts, pathway_facts, pmids,
                                              pmid_date_map_broadcast).cache ()
                count = annotated.count ()
                elapsed = round (time.time () - start, 2)
                logger.info ("Annotation[slice {0}]. {1} binaries in {2} seconds.".format (slice_n, count, elapsed))

                logger.info ("Generating annotated output for " + output_dir)
                os.makedirs (output_dir)

                # Training partition: counts force evaluation before serialization.
                train = annotated. \
                    filter (lambda b : b is not None and is_training (b))
                train.count ()
                train = train.map (lambda b : json.dumps (b, cls=BinaryEncoder))
                train.count ()
                train_out_dir = os.path.join (output_dir, 'train')
                train.saveAsTextFile ("file://" + train_out_dir)
                print (" --> train: {0}".format (train_out_dir))

                # Test partition.
                test = annotated. \
                    filter (lambda b : b is not None and not is_training (b)). \
                    map (lambda b : json.dumps (b, cls=BinaryEncoder))
                test_out_dir = os.path.join (output_dir, 'test')
                test.saveAsTextFile ("file://" + test_out_dir)
                print (" --> test: {0}".format (test_out_dir))

                ''' Save CSV '''
                csv_output = "file://{0}".format (os.path.join (output_dir, "csv"))
                annotated. \
                    map (to_csv_row). \
                    saveAsTextFile (csv_output)
                print (" --> csv: {0}".format (csv_output))

    ''' Concatenate all csvs into one big one '''
    csv_dirs = os.path.join (conf.output_dir, "eval", "annotated")
    print ("scanning {0}".format (csv_dirs))
    csv_files = []
    for root, dirnames, filenames in os.walk (csv_dirs):
        for filename in fnmatch.filter (filenames, '*part-*'):
            if "crc" not in filename and "csv" in root:
                file_name = os.path.join (root, filename)
                csv_files.append (file_name)
    big_csv = os.path.join (conf.output_dir, "eval", "eval.csv")
    with open (big_csv, "w") as stream:
        stream.write ("#pubmed_id,pubmed_date_unix_epoch_time,pubmed_date_human_readable,binary_a_term,binary_b_term,paragraph_distance,sentence_distance,word_distance,flag_if_valid,time_until_verified,freq_sec_deriv\n")
        for f in csv_files:
            with open (f, "r") as in_csv:
                for line in in_csv:
                    stream.write (line)
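# Hypothetical driver sketch (not part of the original module): `conf` only needs
# to expose the attributes referenced above -- input_dir, output_dir, slices,
# ctd_conf, and spark_conf (with a `parts` attribute) -- so a SimpleNamespace
# stands in for the project's real configuration object here, and every path and
# value is illustrative. SparkUtil.get_spark_context and Facts.get_facts may
# expect additional fields on spark_conf and ctd_conf.
#
#   from types import SimpleNamespace
#
#   conf = SimpleNamespace (
#       input_dir  = "/projects/stars/var/chemotext/articles",
#       output_dir = "/projects/stars/var/chemotext/output",
#       slices     = 10,
#       ctd_conf   = SimpleNamespace (),              # placeholder CTD settings
#       spark_conf = SimpleNamespace (parts=1024))    # plus whatever get_spark_context needs
#
#   word2vec (conf)
#   evaluate (conf)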