def make_binaries (article, L, R, threshold=8000):
    logger = LoggingUtil.init_logging (__file__)
    result = []
    for l in L:
        for r in R:
            distance = abs(l.docPos - r.docPos)
            logger.debug ("Distance - {0}".format (distance))
            if distance < threshold:
                if article.date.find ("--") > -1:
                    article.date = "0-0-9000"
                binary = KinaseBinary (id = 0,
                                       L = l.word,
                                       R = r.word,
                                       docDist  = abs(l.docPos - r.docPos),
                                       paraDist = abs(l.paraPos - r.paraPos),
                                       sentDist = abs(l.sentPos - r.sentPos),
                                       code = 0,
                                       fact = False,
                                       refs = [],
                                       pmid = article.id,
                                       date = SerUtil.parse_date (article.date),
                                       file_name = article.fileName)
                logger.info ("Binary: {0}".format (binary))
                result.append (binary)
    return result
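# Usage sketch for make_binaries above (illustrative only; the WordPosition
# values and the article are hypothetical placeholders): each mention in L is
# paired with each mention in R, and pairs closer than `threshold` characters
# become KinaseBinary records.
#
#   L = [ WordPosition (word = "term_a", docPos = 120, paraPos = 0, sentPos = 2) ]
#   R = [ WordPosition (word = "term_b", docPos = 450, paraPos = 1, sentPos = 5) ]
#   binaries = make_binaries (article, L, R, threshold = 8000)
#   # -> one KinaseBinary with docDist = 330, paraDist = 1, sentDist = 3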
def plot_before (before, output_dir):
    from chemotext_util import LoggingUtil
    logger = LoggingUtil.init_logging (__file__)
    if before.count () <= 0:
        return
    '''
    before = before.reduceByKey (lambda x, y : x + y). \
        mapValues (lambda x : filter (lambda v : v is not None, x))
    '''
    true_plots_dir = os.path.join (output_dir, "chart")
    print ("-------------> before 0")
    true_mentions = before. \
        mapValues (lambda x : Plot.plot_true_mentions (x, true_plots_dir)). \
        filter (lambda x : len(x[1]) > 0)
    logger.info ("Plotted {0} sets of true mentions.".format (true_mentions.count ()))
    print ("-------------> before 1")
    false_plots_dir = os.path.join (output_dir, "chart", "false")
    false_mentions = before.subtractByKey (true_mentions)
    false_frequency = false_mentions. \
        mapValues (lambda x : Plot.false_mention_histogram (x, false_plots_dir))
    false_mentions = false_mentions. \
        mapValues (lambda x : Plot.plot_false_mentions (x, false_plots_dir))
    logger.info ("false mentions: {0}".format (false_mentions.count ()))
    true_mentions = None
    false_mentions = None
    false_frequency = None
def lexer (article, terms):
    logger = LoggingUtil.init_logging (__file__)
    logger.debug ("Parsing @ {0}.json".format (article.fileName))
    result = []
    doc_pos = 0
    para_pos = 0
    sent_pos = 0
    for para in article.paragraphs:
        for sentence in para.sentences:
            sentence = sentence.replace (".", " ")
            for term in terms.value:
                if not term or len(term) < 3:
                    continue
                pos = sentence.find (term) if " " in term else sentence.find (" %s " % term)
                if pos > -1:
                    result.append (WordPosition (word = term,
                                                 docPos = doc_pos + pos,
                                                 paraPos = para_pos,
                                                 sentPos = sent_pos))
            sent_pos = sent_pos + 1
            doc_pos = doc_pos + len (sentence)
        para_pos = para_pos + 1
    for r in result:
        logger.info ("word: {0} file: {1}".format (r, article.fileName))
    return result
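# Usage sketch for lexer above: the loop reads `terms.value`, so `terms` is
# expected to behave like a Spark broadcast variable. The term list below is a
# hypothetical placeholder.
#
#   terms = sc.broadcast ([ "kinase", "map kinase", "p38" ])
#   mentions = lexer (article, terms)   # -> [ WordPosition, ... ]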
def word2vec (conf):
    logger = LoggingUtil.init_logging (__file__)
    logger.info ("Creating Chemotext2 word embeddings from input: {0}".format (conf.input_dir))
    sc = SparkUtil.get_spark_context (conf.spark_conf)
    article_paths = SUtil.get_article_paths (conf.input_dir) #[:20000]
    articles = sc.parallelize (article_paths, conf.spark_conf.parts). \
        map (lambda p : SUtil.get_article (p))
    logger.info ("Listed {0} input files".format (articles.count ()))
    conf.output_dir = conf.output_dir.replace ("file:", "")
    conf.output_dir = "file://{0}/w2v".format (conf.output_dir)
    return WordEmbed (sc, conf.output_dir, articles)
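# Worked example of the output path rewrite above (path is hypothetical): if
# conf.output_dir is "/projects/stars/out", the embeddings output directory
# passed to WordEmbed becomes "file:///projects/stars/out/w2v".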
def get_guesses (sc, input_dir, partitions, articles, slices=1, slice_n=1):
    from chemotext_util import LoggingUtil
    logger = LoggingUtil.init_logging (__file__)
    slice_size = int (len (articles) / slices)
    offset = slice_size * slice_n
    rest = len(articles) - offset
    if rest > slice_size and rest <= 2 * slice_size:
        slice_size = rest
    the_slice = articles [ offset : offset + slice_size ]
    logger.info (" -- Guesses (input:{0}, articles:{1}, slice_size:{2}, offset:{3})".
                 format (input_dir, len(articles), slice_size, offset))
    articles = sc.parallelize (the_slice, partitions). \
        flatMap (lambda p : EquivalentSet.get_article (p)). \
        sample (False, debug_scale). \
        cache ()
    return (
        articles.flatMap (Guesses.get_article_guesses).cache (),
        articles.map (lambda a : a.id).collect ()
    )
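# Worked example of the slicing arithmetic above (counts are hypothetical):
# with len(articles) = 1001 and slices = 4, slice_size = 250. For the last
# slice, slice_n = 3, offset = 750 and rest = 251; since rest > slice_size and
# rest <= 2 * slice_size, slice_size is widened to 251 so the slice
# articles[750:1001] also picks up the remainder article.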
import random
import re
import shutil
import time
import traceback
from chemotext_util import SparkConf
from chemotext_util import SparkUtil
from chemotext_util import EvaluateConf
from chemotext_util import LoggingUtil
from datetime import date
from graphframes import GraphFrame
from rdflib import Graph
from pyspark.sql import Row
from pyspark.sql import SQLContext

logger = LoggingUtil.init_logging (__file__)

def trim_uri (u):
    result = u
    if u.startswith ("<http://"):
        s = u.replace('>', '').split ('/')
        n = len (s)
        result = '/'.join (s[n-2:n])
    return result

def process_graphs (sc, in_dir, partitions):
    """
    Read graph vertices and edges from disk if already saved.
    Otherwise, read chem2bio2rdf drugbank, pubchem, and other N3 RDF models.
    Save vertices and edges to disk.
    """
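# Worked example for trim_uri above (the URI is a hypothetical
# chem2bio2rdf-style identifier): the angle brackets are stripped and only the
# last two path segments are kept.
#
#   trim_uri ("<http://chem2bio2rdf.org/drugbank/resource/drugbank_drug/DB00619>")
#   # -> "drugbank_drug/DB00619"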
def evaluate (conf):
    logger = LoggingUtil.init_logging (__file__)
    logger.info ("Evaluating Chemotext2 output: {0}".format (conf.input_dir))
    sc = SparkUtil.get_spark_context (conf.spark_conf)
    facts = Facts.get_facts (sc, conf.ctd_conf)
    pathway_facts = Facts.get_pathway_facts (sc, conf.ctd_conf)
    logger.info ("Loaded {0} facts".format (facts.count ()))
    articles = SUtil.get_article_paths (conf.input_dir) #[:200]
    logger.info ("Listed {0} input files".format (len(articles)))
    for slice_n in range (0, conf.slices):
        output_dir = os.path.join (conf.output_dir, "eval", "annotated", str(slice_n))
        if os.path.exists (output_dir):
            logger.info ("Skipping existing directory {0}".format (output_dir))
        else:
            logger.info ("Loading guesses")
            start = time.time ()
            guesses, article_pmids = Guesses.get_guesses (sc,
                                                          conf.input_dir,
                                                          conf.spark_conf.parts,
                                                          articles,
                                                          conf.slices,
                                                          slice_n)
            elapsed = round (time.time () - start, 2)
            count = guesses.count ()
            logger.info ("Guesses[slice {0}]. {1} binaries in {2} seconds.".format (slice_n, count, elapsed))
            pmids = sc.broadcast (article_pmids)
            start = time.time ()
            pmid_date_map = None
            pmid_map_path = os.path.join (
                os.path.dirname (conf.input_dir),
                "pmid", "pmid_date_2.json") # /projects/stars/var/chemotext/pmid/pmid_date_2.json
            print ("Loading pmid date map: {0}".format (pmid_map_path))
            with open (pmid_map_path, "r") as stream:
                pmid_date_map = json.loads (stream.read ())
            elapsed = round (time.time () - start, 2)
            print ("Read pmid date map in {0} seconds".format (elapsed))
            if pmid_date_map is None:
                print ("Unable to load pmid date map")
            else:
                start = time.time ()
                pmid_date_map_broadcast = sc.broadcast (pmid_date_map)
                annotated = Guesses.annotate (guesses, facts, pathway_facts, pmids, pmid_date_map_broadcast).cache ()
                count = annotated.count ()
                elapsed = round (time.time () - start, 2)
                logger.info ("Annotation[slice {0}]. {1} binaries in {2} seconds.".format (slice_n, count, elapsed))
                logger.info ("Generating annotated output for " + output_dir)
                os.makedirs (output_dir)

                train = annotated. \
                    filter (lambda b : b is not None and is_training (b))
                train.count ()
                train = train.map (lambda b : json.dumps (b, cls=BinaryEncoder))
                train.count ()
                train_out_dir = os.path.join (output_dir, 'train')
                train.saveAsTextFile ("file://" + train_out_dir)
                print (" --> train: {0}".format (train_out_dir))

                test = annotated. \
                    filter (lambda b : b is not None and not is_training (b)). \
                    map (lambda b : json.dumps (b, cls=BinaryEncoder))
                test_out_dir = os.path.join (output_dir, 'test')
                test.saveAsTextFile ("file://" + test_out_dir)
                print (" --> test: {0}".format (test_out_dir))

                ''' Save CSV '''
                csv_output = "file://{0}".format (os.path.join (output_dir, "csv"))
                annotated. \
                    map (to_csv_row). \
                    saveAsTextFile (csv_output)
                print (" --> csv: {0}".format (csv_output))

    ''' Concatenate all csvs into one big one '''
    csv_dirs = os.path.join (conf.output_dir, "eval", "annotated")
    print ("scanning {0}".format (csv_dirs))
    csv_files = []
    for root, dirnames, filenames in os.walk (csv_dirs):
        for filename in fnmatch.filter (filenames, '*part-*'):
            if not "crc" in filename and "csv" in root:
                file_name = os.path.join (root, filename)
                csv_files.append (file_name)
    big_csv = os.path.join (conf.output_dir, "eval", "eval.csv")
    with open (big_csv, "w") as stream:
        stream.write ("#pubmed_id,pubmed_date_unix_epoch_time,pubmed_date_human_readable,binary_a_term,binary_b_term,paragraph_distance,sentence_distance,word_distance,flag_if_valid,time_until_verified,freq_sec_deriv\n")
        for f in csv_files:
            with open (f, "r") as in_csv:
                for line in in_csv:
                    stream.write (line)
def trace_set (trace_level, label, rdd):
    logger = LoggingUtil.init_logging (__file__)
    if (logger.getEffectiveLevel() > trace_level):
        for g in rdd.collect ():
            print ("  {0}> {1}->{2}".format (label, g[0], g[1]))
def make_equiv_set (L, R, threshold=800):
    logger = LoggingUtil.init_logging (__file__)
    pairs = {}
    # Create all possible pairs
    for left in L:
        if left.word in skiplist:
            continue
        for right in R:
            if right.word in skiplist or right.word is left.word:
                continue
            docDist = abs (left.docPos - right.docPos)
            if docDist < threshold:
                key = "{0}@{1}".format (left.word, right.word)
                binary = Binary (
                    id = 0,
                    L = left.word,
                    R = right.word,
                    docDist = docDist,
                    sentDist = abs ( left.sentPos - right.sentPos ),
                    paraDist = abs ( left.paraPos - right.paraPos ),
                    code = 1,
                    fact = False,
                    refs = [],
                    leftDocPos = left.docPos,
                    rightDocPos = right.docPos)
                if key in pairs:
                    pairs[key].append ( EqBinary (binary, left, right) )
                else:
                    pairs[key] = [ EqBinary (binary, left, right) ]
    if log_trace:
        print ("")
        for k, v in pairs.items ():
            for val in v:
                print ("  --: [{0}] -> [{1}]".format (k, val))
    # GroupBy (x,y)
    REk = []
    for key in pairs:
        REk_key = []
        if log_trace:
            print ("key: {0}".format (key))
        Ek = pairs[key]
        original_length = len (Ek)
        # Sort
        Ek.sort (key = lambda p: p.binary.docDist)
        while len(Ek) > 0:
            EquivalentSet.log_sorted (Ek)
            canonical = Ek[0]
            if log_trace:
                print ("  2. canonical: {0}".format (canonical))
            # Min distance pair
            REk_key.append (canonical.binary)
            old = Ek
            Ek = [ e for e in Ek if not (e.L.docPos == canonical.L.docPos or
                                         e.R.docPos == canonical.R.docPos) ]
            if log_trace:
                for o in old:
                    if o not in Ek:
                        print ("  3. discard {0}".format (o))
        EquivalentSet.log_reduced_set (REk_key, original_length, key)
        REk = REk + REk_key
    if log_trace:
        print ("REk: {0}".format (REk))
    return REk
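# Worked example of the reduction above (positions are hypothetical): suppose
# key "a@b" has three EqBinary candidates with (L.docPos, R.docPos, docDist) =
# (10, 20, 10), (10, 95, 85), and (200, 95, 105). After sorting by docDist the
# canonical pair is (10, 20); any remaining candidate sharing its left
# position 10 or its right position 20 is discarded, which removes
# (10, 95, 85). The next canonical pair is (200, 95, 105), so the reduced set
# for this key is [(10, 20), (200, 95)].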