def augment (conf):
    ''' For now, this is a place to extend the data in the CSV with
    ...
    * Word2Vec derived cosine similarity
    * Second derivative of the frequency of mentions

    :param conf: Evaluation configuration carrying spark_conf and output_dir.
    '''
    sc = SparkUtil.get_spark_context (conf.spark_conf)
    # Normalize output_dir into an explicit file:// URI for Spark output.
    conf.output_dir = conf.output_dir.replace ("file:", "")
    conf.output_dir = "file://{0}".format (conf.output_dir)
    # Group binaries by simplekey, compute the per-group frequency second
    # derivative, then flatten back to individual (now augmented) binaries.
    groups = Evaluate.load_all (sc, conf). \
        map (lambda b : ( simplekey(b), [ b ] ) ). \
        reduceByKey (lambda x, y : x + y). \
        mapValues (lambda b : freq_derivative (b)). \
        flatMap (lambda x : x[1])
    '''
    for k, v in groups.collect ():
        for b in v:
            print (" frequency second derivative {0} => {1}".format (k, b.freq_sec_deriv))
    '''
    # Coalesce to one partition so Spark writes a single output part file.
    groups = groups.coalesce (1)
    output_file = os.path.join (conf.output_dir, "eval", "augment.csv")
    no_proto_output_file = output_file.replace ("file://", "")
    # saveAsTextFile refuses to overwrite; remove any previous output first.
    if os.path.exists (no_proto_output_file):
        print ("removing existing output file")
        shutil.rmtree (no_proto_output_file)
    groups.map(to_csv_row).saveAsTextFile (output_file)
def main ():
    """ Annotate model files with word embedding computed cosine similarity. """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--host", help="Mesos master host")
    arg_parser.add_argument("--name", help="Spark framework name")
    arg_parser.add_argument("--input", help="Output directory for a Chemotext2 run.")
    arg_parser.add_argument("--output", help="Output directory for evaluation.")
    arg_parser.add_argument("--slices", help="Number of separate work chunks.")
    arg_parser.add_argument("--parts", help="Number of partitions for the computation.")
    arg_parser.add_argument("--venv", help="Path to Python virtual environment to use")
    opts = arg_parser.parse_args()
    # Build the Spark configuration first, then wrap it into the
    # evaluation configuration with normalized (protocol-free) paths.
    spark_settings = SparkConf (host           = opts.host,
                                venv           = opts.venv,
                                framework_name = opts.name,
                                parts          = int(opts.parts))
    conf = EvaluateConf (spark_conf = spark_settings,
                         input_dir  = opts.input.replace ("file://", ""),
                         output_dir = opts.output.replace ("file://", ""),
                         slices     = int(opts.slices))
    print ("Data home: {0}".format (opts.input))
    sc = SparkUtil.get_spark_context (conf.spark_conf)
    process_graphs (sc, conf.input_dir, conf.spark_conf.parts)
def main ():
    """ Tools for running word2vec on the corpus. """
    cli = argparse.ArgumentParser()
    cli.add_argument("--host", help="Mesos master host")
    cli.add_argument("--name", help="Spark framework name")
    cli.add_argument("--input", help="Output directory for a Chemotext2 run.")
    cli.add_argument("--output", help="Output directory for evaluation.")
    cli.add_argument("--slices", help="Number of slices of files to iterate over.")
    cli.add_argument("--parts", help="Number of partitions for the computation.")
    cli.add_argument("--venv", help="Path to Python virtual environment to use")
    parsed = cli.parse_args()
    conf = EvaluateConf (
        spark_conf = SparkConf (host = parsed.host,
                                venv = parsed.venv,
                                framework_name = parsed.name,
                                parts = int(parsed.parts)),
        input_dir  = parsed.input.replace ("file://", ""),
        output_dir = parsed.output.replace ("file://", ""),
        slices     = int(parsed.slices))
    # Word2Vec models live under a w2v/gensim directory that sits
    # alongside the input corpus directory.
    corpus_root = os.path.dirname (conf.input_dir)
    model_dir = os.path.join (corpus_root, "w2v", "gensim")
    sc = SparkUtil.get_spark_context (conf.spark_conf)
    file_list = "/projects/stars/app/chemotext/filelist.json"
    # hgnc = os.path.join (os.path.basedir (conf.spark_conf.otput_dir), "HGNC", "HGNCGeneFamilyDataSet.csv")
    hgnc = os.path.join (conf.output_dir, "HGNC", "HGNCGeneSynonyms.csv")
    print ("hgnc: {0}".format (hgnc))
    build_all_models (sc, conf.input_dir, file_list, model_dir, hgnc)
def word2vec (conf):
    """ Create Chemotext2 word embeddings from the configured input corpus.

    :param conf: Configuration with spark_conf, input_dir, and output_dir.
    :return: The WordEmbed object built over the article RDD.
    """
    logger = LoggingUtil.init_logging (__file__)
    logger.info ("Creating Chemotext2 word embeddings from input: {0}".format (conf.input_dir))
    sc = SparkUtil.get_spark_context (conf.spark_conf)
    paths = SUtil.get_article_paths (conf.input_dir) #[:20000]
    # Distribute the path list and parse each article on the cluster.
    articles = sc.parallelize (paths, conf.spark_conf.parts).map (SUtil.get_article)
    logger.info ("Listed {0} input files".format (articles.count ()))
    # Embeddings are written under <output_dir>/w2v, as a file:// URI.
    stripped = conf.output_dir.replace ("file:", "")
    conf.output_dir = "file://{0}/w2v".format (stripped)
    return WordEmbed (sc, conf.output_dir, articles)
def analyze_medline (conf):
    """ Explore Medline metadata: load the pmid -> date map, broadcast it to
    the cluster, and smoke-test lookups through the broadcast variable.

    :param conf: Spark configuration passed to SparkUtil.get_spark_context.
    """
    # Initialize a local logger, consistent with the other entry points in
    # this file (word2vec, evaluate).
    logger = LoggingUtil.init_logging (__file__)
    logger.info ("conf: {0}".format (conf))
    sc = SparkUtil.get_spark_context (conf)
    #medline_conn = Medline (sc, conf.input_xml, use_mem_cache=True)
    # create_pmid_map (medline_conn)
    start = time.time()
    sqlContext = SQLContext (sc)
    #p = sqlContext.jsonFile("alluxio://stars-c0.edc.renci.org:19998/chemotext/pmid/pmid_date_2.json").collectAsMap ()
    # Load the precomputed pmid -> date map from the shared filesystem.
    with open ("/projects/stars/var/chemotext/pmid/pmid_date_2.json", "r") as stream:
        p = json.loads (stream.read ())
    elapsed = time.time() - start
    print ("TIME(load): ------------> {0}".format (elapsed))
    broadcastPMID = sc.broadcast (p)
    # Smoke test: resolve a few pmids through the broadcast variable.
    spots = sc.parallelize ([ 1, 2, 3 ])
    times = spots.map (lambda s : broadcastPMID.value [str(s)])
    # Parenthesized print for consistency with the rest of the file
    # (the original used a Python-2-only bare print statement).
    print (times.collect ())
def execute (conf, home):
    """ Run the literature crawl: find interactions, facts, and mentions that
    precede their reference date, then probe the word embedding space for
    kinase/p53 synonyms of vocabulary terms.

    :param conf: Configuration with spark_conf and data_lake_conf.
    :param home: Unused here; kept for interface compatibility with callers.
    """
    sc = SparkUtil.get_spark_context (conf.spark_conf)
    data_lake = DataLake (sc, conf.data_lake_conf)
    kin2prot = data_lake.get_kin2prot ()
    articles = data_lake.load_articles ()
    vocabulary = data_lake.load_vocabulary (kin2prot)
    pmid_date = data_lake.load_pmid_date () # ( pmid -> date )
    binaries = LitCrawl.find_interactions (sc, vocabulary, articles)
    facts = LitCrawl.find_facts (vocabulary, binaries)
    before = LitCrawl.find_before (pmid_date, facts)
    for m in before.collect ():
        logger.info ("Before-Ref-Date:> {0}".format (m))
    # Probe the embedding space: report synonyms mentioning kinase or p53.
    embed = WordEmbed (sc, conf, articles)
    for w in vocabulary.A.collect ():
        for syn in embed.find_syn (w, radius=800):
            if "kinase" in syn or "p53" in syn:
                # Parenthesized print for consistency with the rest of the
                # file (original used a Python-2-only bare print statement).
                print (" --[ {0} ]:syn>> {1}".format (w, syn))
def plot (conf):
    """ Generate evaluation plots from annotated binaries. """
    sc = SparkUtil.get_spark_context (conf.spark_conf)
    sqlContext = SQLContext(sc) # object unused but defines toDF()
    print ("Original: Output dir: {0}".format (conf.output_dir))
    # Rewrite output_dir as a file:// URI pointing at the eval subdirectory.
    conf.output_dir = "file://{0}/eval".format (conf.output_dir.replace ("file:", ""))
    print ("Output dir: {0}".format (conf.output_dir))
    annotated = Evaluate.load_all (sc, conf) #.sample (False, 0.02)
    # Group binaries by simplekey, then drop None entries from each group.
    keyed = annotated.map (lambda b : ( simplekey(b), [ b ] ) )
    grouped = keyed.reduceByKey (lambda left, right : left + right)
    before = grouped.mapValues (lambda members : filter (lambda v : v is not None, members))
    print ("Got {0} before values".format (before.count ()))
    plot_path = conf.output_dir.replace ("file://", "")
    print ("Generating plots to plot path: {0}".format (plot_path))
    Evaluate.plot_before (before, plot_path)
    before = None # drop the grouped RDD reference
    # Distance plots operate on (fact, docDist, paraDist, sentDist) tuples.
    distances = annotated.map (lambda x : ( x.fact, x.docDist, x.paraDist, x.sentDist) )
    Evaluate.plot_distances (distances)
def train_model (conf):
    """ Train a logistic regression model over annotated binaries, persist it,
    reload it, and report classification and regression metrics.

    Label is 1.0 for known facts, 0.0 otherwise; features are the three
    distance measures (paragraph, sentence, document).

    :param conf: Configuration with spark_conf and output_dir.
    """
    sc = SparkUtil.get_spark_context (conf.spark_conf)
    conf.output_dir = conf.output_dir.replace ("file:", "")
    conf.output_dir = "file://{0}".format (conf.output_dir)
    labeled = Evaluate.load_all (sc, conf). \
        map (lambda b : LabeledPoint ( label = 1.0 if b.fact else 0.0,
                                       features = [ b.paraDist, b.sentDist, b.docDist ] ) )
#    labeled = sc.parallelize ([ round ((x/10) * 9) for x in random.sample(range(1, 100000000), 30000) ]). \
#        map (lambda b : LabeledPoint ( 1.0 if b % 2 == 0 else 0.0,
#                                       [ b, b * 2, b * 9 ] ) )
#    print (labeled.collect ())
    train, test = labeled.randomSplit (weights=[ 0.8, 0.2 ], seed=12345)
    count = train.count ()
    start = time.time ()
    model = LogisticRegressionWithLBFGS.train (train)
    elapsed = time.time () - start
    print ("Trained model on training set of size {0} in {1} seconds".format (count, elapsed))
    start = time.time ()
    model_path = os.path.join (conf.output_dir, "eval", "model")
    file_path = model_path.replace ("file://", "")
    # model.save refuses to overwrite; remove any previous model first.
    if os.path.isdir (file_path):
        print ("Removing existing model {0}".format (file_path))
        shutil.rmtree (file_path)
    model.save(sc, model_path)
    sameModel = LogisticRegressionModel.load(sc, model_path)
    elapsed = time.time () - start
    print ("Saved and restored model to {0} in {1} seconds".format (model_path, elapsed))
    # Metrics. Indexing lambdas replace the original Python-2-only
    # tuple-parameter lambdas (lambda (v, p): ...), which are a syntax
    # error on Python 3; behavior is identical.
    labelsAndPreds = test.map (lambda p: (p.label, model.predict (p.features)))
    trainErr = labelsAndPreds.filter (lambda vp: vp[0] != vp[1]).count () / float (train.count())
    print("Training Error => {0}".format (trainErr))
    predictionsAndLabels = labelsAndPreds.map (lambda x : ( float(x[1]), float(x[0]) ))
    metrics = MulticlassMetrics (predictionsAndLabels)
    print (" --------------> {0}".format (predictionsAndLabels.take (1000)))
    #print (labelsAndPreds.collect ())
    print ("\nMETRICS:")
    # Catch Exception rather than the original bare except so that
    # KeyboardInterrupt / SystemExit still propagate.
    try:
        print ("false positive (0.0): {0}".format (metrics.falsePositiveRate(0.0)))
        print ("false positive (1.0): {0}".format (metrics.falsePositiveRate(1.0)))
    except Exception:
        traceback.print_exc ()
    try:
        print ("precision : {0}".format (metrics.precision(1.0)))
    except Exception:
        traceback.print_exc ()
    try:
        print ("recall : {0}".format (metrics.recall(1.0)))
    except Exception:
        traceback.print_exc ()
    try:
        print ("fMeasure : {0}".format (metrics.fMeasure(0.0, 2.0)))
    except Exception:
        traceback.print_exc ()
    print ("confusion matrix : {0}".format (metrics.confusionMatrix().toArray ()))
    print ("precision : {0}".format (metrics.precision()))
    print ("recall : {0}".format (metrics.recall()))
    print ("weighted false pos : {0}".format (metrics.weightedFalsePositiveRate))
    print ("weighted precision : {0}".format (metrics.weightedPrecision))
    print ("weighted recall : {0}".format (metrics.weightedRecall))
    print ("weight f measure : {0}".format (metrics.weightedFMeasure()))
    print ("weight f measure 2 : {0}".format (metrics.weightedFMeasure(2.0)))
    print ("")
    # Regression metrics over (prediction, observation) pairs.
    predictedAndObserved = test.map (lambda p: (model.predict (p.features) / 1.0 , p.label / 1.0 ) )
    regression_metrics = RegressionMetrics (predictedAndObserved)
    print ("explained variance......: {0}".format (regression_metrics.explainedVariance))
    print ("absolute error..........: {0}".format (regression_metrics.meanAbsoluteError))
    print ("mean squared error......: {0}".format (regression_metrics.meanSquaredError))
    print ("root mean squared error.: {0}".format (regression_metrics.rootMeanSquaredError))
    print ("r2......................: {0}".format (regression_metrics.r2))
    print ("")
    # Verify the reloaded model predicts the held-out test set.
    labelsAndPreds = test.map (lambda p: (p.label, sameModel.predict (p.features)))
    testErr = labelsAndPreds.filter (lambda vp: vp[0] != vp[1]).count () / float (test.count ())
    print ("Testing Error => {0}".format (testErr))
def evaluate (conf):
    """ Evaluate Chemotext2 output: annotate guessed binaries against CTD
    facts slice by slice, write train/test/csv outputs per slice, and
    concatenate all per-slice CSVs into one eval.csv.

    :param conf: Configuration with spark_conf, ctd_conf, input_dir,
                 output_dir, and the number of slices.
    """
    logger = LoggingUtil.init_logging (__file__)
    logger.info ("Evaluating Chemotext2 output: {0}".format (conf.input_dir))
    sc = SparkUtil.get_spark_context (conf.spark_conf)
    facts = Facts.get_facts (sc, conf.ctd_conf)
    pathway_facts = Facts.get_pathway_facts (sc, conf.ctd_conf)
    logger.info ("Loaded {0} facts".format (facts.count ()))
    articles = SUtil.get_article_paths (conf.input_dir) #[:200]
    logger.info ("Listed {0} input files".format (len(articles)))
    for slice_n in range (0, conf.slices):
        output_dir = os.path.join (conf.output_dir, "eval", "annotated", str(slice_n))
        # A slice directory that already exists is treated as done (resumable runs).
        if os.path.exists (output_dir):
            logger.info ("Skipping existing directory {0}".format (output_dir))
        else:
            logger.info ("Loading guesses")
            start = time.time ()
            guesses, article_pmids = Guesses.get_guesses (sc,
                                                          conf.input_dir,
                                                          conf.spark_conf.parts,
                                                          articles,
                                                          conf.slices,
                                                          slice_n)
            elapsed = round (time.time () - start, 2)
            count = guesses.count ()
            logger.info ("Guesses[slice {0}]. {1} binaries in {2} seconds.".format (slice_n, count, elapsed))
            pmids = sc.broadcast (article_pmids)
            start = time.time ()
            pmid_date_map = None
            # The pmid date map sits next to the input dir, e.g.
            pmid_map_path = os.path.join (
                os.path.dirname (conf.input_dir),
                "pmid", "pmid_date_2.json") # /projects/stars/var/chemotext/pmid/pmid_date_2.json
            print ("Loading pmid date map: {0}".format (pmid_map_path))
            with open (pmid_map_path, "r") as stream:
                pmid_date_map = json.loads (stream.read ())
            elapsed = round (time.time () - start, 2)
            print ("Read pmid date map in {0} seconds".format (elapsed))
            if pmid_date_map is None:
                print ("Unable to load pmid date map")
            else:
                start = time.time ()
                pmid_date_map_broadcast = sc.broadcast (pmid_date_map)
                # Cache: annotated is consumed three times (train, test, csv).
                annotated = Guesses.annotate (guesses, facts, pathway_facts, pmids, pmid_date_map_broadcast).cache ()
                count = annotated.count ()
                elapsed = round (time.time () - start, 2)
                logger.info ("Annotation[slice {0}]. {1} binaries in {2} seconds.".format (slice_n, count, elapsed))
                logger.info ("Generating annotated output for " + output_dir)
                os.makedirs (output_dir)
                # Training partition, serialized as JSON lines.
                train = annotated. \
                    filter (lambda b : b is not None and is_training (b))
                train.count ()
                train = train.map (lambda b : json.dumps (b, cls=BinaryEncoder))
                train.count ()
                train_out_dir = os.path.join (output_dir, 'train')
                train.saveAsTextFile ("file://" + train_out_dir)
                print (" --> train: {0}".format (train_out_dir))
                # Test partition: everything not in the training set.
                test = annotated. \
                    filter (lambda b : b is not None and not is_training (b)).\
                    map (lambda b : json.dumps (b, cls=BinaryEncoder))
                test_out_dir = os.path.join (output_dir, 'test')
                test.saveAsTextFile ("file://" + test_out_dir)
                print (" --> test: {0}".format (test_out_dir))
                ''' Save CSV '''
                csv_output = "file://{0}".format (os.path.join (output_dir, "csv"))
                annotated. \
                    map (to_csv_row). \
                    saveAsTextFile (csv_output)
                print (" --> csv: {0}".format (csv_output))
                ''' Concatenate all csvs into one big one '''
                # NOTE(review): this concatenation scans ALL slice dirs and
                # rewrites eval.csv on every slice iteration — confirm whether
                # it was intended to run once after the loop instead.
                csv_dirs = os.path.join (conf.output_dir, "eval", "annotated")
                print ("scanning {0}".format (csv_dirs))
                csv_files = []
                for root, dirnames, filenames in os.walk (csv_dirs):
                    for filename in fnmatch.filter(filenames, '*part-*'):
                        # Skip Hadoop .crc checksum files; keep only csv part files.
                        if not "crc" in filename and "csv" in root:
                            file_name = os.path.join(root, filename)
                            csv_files.append (file_name)
                big_csv = os.path.join (conf.output_dir, "eval", "eval.csv")
                with open (big_csv, "w") as stream:
                    stream.write ("#pubmed_id,pubmed_date_unix_epoch_time,pubmed_date_human_readable,binary_a_term,binary_b_term,paragraph_distance,sentence_distance,word_distance,flag_if_valid,time_until_verified,freq_sec_deriv\n")
                    for f in csv_files:
                        with open (f, "r") as in_csv:
                            for line in in_csv:
                                stream.write(line)