def word2vec (conf):
    logger = LoggingUtil.init_logging (__file__)
    logger.info ("Creating Chemotext2 word embeddings from input: {0}".format (conf.input_dir))
    sc = SparkUtil.get_spark_context (conf.spark_conf)
    article_paths = SUtil.get_article_paths (conf.input_dir) #[:20000]
    articles = sc.parallelize (article_paths, conf.spark_conf.parts). \
               map (lambda p : SUtil.get_article (p))
    logger.info ("Listed {0} input files".format (articles.count ()))
    conf.output_dir = conf.output_dir.replace ("file:", "")
    conf.output_dir = "file://{0}/w2v".format (conf.output_dir)
    return WordEmbed (sc, conf.output_dir, articles)
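
# Illustrative sketch only (not part of the original module): WordEmbed is defined
# elsewhere and its internals are not shown here. Assuming it wraps a gensim-style
# Word2Vec model trained over a sentence iterator, training would look roughly like
# the hypothetical helper below.
def _example_train_word2vec (sentences, model_path):
    from gensim.models import Word2Vec
    # 'sentences' is any iterable of token lists, e.g. the corpus iterator defined below.
    model = Word2Vec (sentences)
    model.save (model_path)
    return model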
def make_binaries (article, L, R, threshold=8000):
    logger = LoggingUtil.init_logging (__file__)
    result = []
    for l in L:
        for r in R:
            distance = abs (l.docPos - r.docPos)
            logger.debug ("Distance - {0}".format (distance))
            if distance < threshold:
                if article.date.find ("--") > -1:
                    article.date = "0-0-9000"
                binary = KinaseBinary (id        = 0,
                                       L         = l.word,
                                       R         = r.word,
                                       docDist   = distance,
                                       paraDist  = abs (l.paraPos - r.paraPos),
                                       sentDist  = abs (l.sentPos - r.sentPos),
                                       code      = 0,
                                       fact      = False,
                                       refs      = [],
                                       pmid      = article.id,
                                       date      = SerUtil.parse_date (article.date),
                                       file_name = article.fileName)
                logger.info ("Binary: {0}".format (binary))
                result.append (binary)
    return result
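
# Illustrative sketch only: a plain-Python version of the pairing rule above, using
# hypothetical mention objects that carry 'word' and 'docPos' attributes. Every
# (left, right) pair whose document positions differ by less than 'threshold' is
# kept as a candidate binary.
def _example_pair_mentions (left_mentions, right_mentions, threshold=8000):
    from collections import namedtuple
    Pair = namedtuple ("Pair", [ "L", "R", "docDist" ])
    return [ Pair (l.word, r.word, abs (l.docPos - r.docPos))
             for l in left_mentions
             for r in right_mentions
             if abs (l.docPos - r.docPos) < threshold ]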
def find_before (pmid_date, facts):
    logger = LoggingUtil.init_logging (__file__)
    logger.info ("Join facts with the pmid->date map to find interactions noticed before published discovery.")
    # ( intact.REF[pmid] -> KinaseBinary )
    ref_pmid_to_binary = facts.map (lambda r : ( r[1][1].pmid, r[1][0] ) )
    # TEST. Add reference pmids with late dates.
    pmid_date = pmid_date.union (ref_pmid_to_binary.map (lambda r : ( r[0], SerUtil.parse_date ("1-1-2300") )))
    before = ref_pmid_to_binary. \
             join (pmid_date). \
             map (lambda r : r[1][0].copy (ref_date = r[1][1]) ). \
             filter (lambda k : k.date and k.ref_date and k.date < k.ref_date). \
             distinct ()
    return before
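
# Illustrative sketch only: the join above pairs each (pmid, KinaseBinary) record with
# the (pmid, date) map, yielding (pmid, (binary, ref_date)) tuples, which the map step
# then folds back into the binary. A plain-Python analogue with hypothetical data:
def _example_join_shape ():
    ref_pmid_to_binary = [ ("123", "binary_A"), ("456", "binary_B") ]
    pmid_date          = { "123" : "2001-05-01", "456" : "1999-01-01" }
    # Mirrors rdd.join (): (key, (left_value, right_value)) for keys present on both sides.
    return [ (pmid, (binary, pmid_date [pmid]))
             for pmid, binary in ref_pmid_to_binary
             if pmid in pmid_date ]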
def __iter__(self):
    for file_name in self.files:
        if self.match (file_name):
            base = "{0}.json".format (os.path.basename (file_name))
            article_path = os.path.join (self.input_dir, base)
            article = SUtil.get_article (article_path)
            if article is not None:
                # http://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize
                sentence_tokens = [ self.tokenizer.tokenize (s) for s in sent_tokenize (article.raw) ]
                sentences = [ self.gene_syn.make_canonical (s) for s in sentence_tokens ]
                #print (" sentences------------> {0}".format (sentences))
                #sentences = [ self.tokenizer.tokenize(s) for p in article.paragraphs for s in p.sentences ]
                #sentences = [ s.split(' ') for p in article.paragraphs for s in p.sentences ]
                for s in sentences:
                    yield s
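
# Illustrative sketch only: the iterator above streams one tokenized, canonicalized
# sentence at a time so the corpus never has to fit in memory. The NLTK calls it relies
# on behave roughly like this; TreebankWordTokenizer stands in for self.tokenizer,
# which is configured elsewhere in the class.
def _example_tokenize (raw_text):
    from nltk.tokenize import sent_tokenize, TreebankWordTokenizer
    tokenizer = TreebankWordTokenizer ()
    return [ tokenizer.tokenize (s) for s in sent_tokenize (raw_text) ]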
def get_article_guesses (article):
    guesses = article.AB + article.BC + article.AC + article.BB
    skiplist = [ 'for', 'was', 'she', 'long' ]
    result = []
    for g in guesses:
        if g.L not in skiplist and g.R not in skiplist:
            g.pmid = article.id
            try:
                date = SUtil.parse_date (article.date)
                if date:
                    g.date = calendar.timegm (date.timetuple ())
                    # print ("Parsed guess date -> {0}".format (g.date))
            except Exception:
                print ("No date parsed in {0} {1}".format (article.fileName, article.date))
                traceback.print_exc ()
            result.append ( ( make_key (g.L, g.R, g.pmid), Guesses.distance (g) ) )
    return result
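
# Illustrative sketch only: the guess date above is stored as Unix epoch seconds.
# calendar.timegm interprets the time tuple as UTC, unlike time.mktime, which applies
# the local timezone.
def _example_date_to_epoch ():
    import calendar
    from datetime import datetime
    parsed = datetime (2001, 5, 1)
    return calendar.timegm (parsed.timetuple ())   # -> 988675200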
def evaluate (conf):
    logger = LoggingUtil.init_logging (__file__)
    logger.info ("Evaluating Chemotext2 output: {0}".format (conf.input_dir))
    sc = SparkUtil.get_spark_context (conf.spark_conf)
    facts = Facts.get_facts (sc, conf.ctd_conf)
    pathway_facts = Facts.get_pathway_facts (sc, conf.ctd_conf)
    logger.info ("Loaded {0} facts".format (facts.count ()))
    articles = SUtil.get_article_paths (conf.input_dir) #[:200]
    logger.info ("Listed {0} input files".format (len (articles)))
    for slice_n in range (0, conf.slices):
        output_dir = os.path.join (conf.output_dir, "eval", "annotated", str (slice_n))
        if os.path.exists (output_dir):
            logger.info ("Skipping existing directory {0}".format (output_dir))
        else:
            logger.info ("Loading guesses")
            start = time.time ()
            guesses, article_pmids = Guesses.get_guesses (sc,
                                                          conf.input_dir,
                                                          conf.spark_conf.parts,
                                                          articles,
                                                          conf.slices,
                                                          slice_n)
            elapsed = round (time.time () - start, 2)
            count = guesses.count ()
            logger.info ("Guesses[slice {0}]. {1} binaries in {2} seconds.".format (slice_n, count, elapsed))

            pmids = sc.broadcast (article_pmids)

            start = time.time ()
            pmid_date_map = None
            pmid_map_path = os.path.join (os.path.dirname (conf.input_dir), "pmid", "pmid_date_2.json")
            # /projects/stars/var/chemotext/pmid/pmid_date_2.json
            print ("Loading pmid date map: {0}".format (pmid_map_path))
            with open (pmid_map_path, "r") as stream:
                pmid_date_map = json.loads (stream.read ())
            elapsed = round (time.time () - start, 2)
            print ("Read pmid date map in {0} seconds".format (elapsed))

            if pmid_date_map is None:
                print ("Unable to load pmid date map")
            else:
                start = time.time ()
                pmid_date_map_broadcast = sc.broadcast (pmid_date_map)
                annotated = Guesses.annotate (guesses, facts, pathway_facts, pmids, pmid_date_map_broadcast).cache ()
                count = annotated.count ()
                elapsed = round (time.time () - start, 2)
                logger.info ("Annotation[slice {0}]. {1} binaries in {2} seconds.".format (slice_n, count, elapsed))
                logger.info ("Generating annotated output for " + output_dir)
                os.makedirs (output_dir)

                train = annotated. \
                        filter (lambda b : b is not None and is_training (b))
                train.count ()
                train = train.map (lambda b : json.dumps (b, cls=BinaryEncoder))
                train.count ()
                train_out_dir = os.path.join (output_dir, 'train')
                train.saveAsTextFile ("file://" + train_out_dir)
                print (" --> train: {0}".format (train_out_dir))

                test = annotated. \
                       filter (lambda b : b is not None and not is_training (b)). \
                       map (lambda b : json.dumps (b, cls=BinaryEncoder))
                test_out_dir = os.path.join (output_dir, 'test')
                test.saveAsTextFile ("file://" + test_out_dir)
                print (" --> test: {0}".format (test_out_dir))

                ''' Save CSV '''
                csv_output = "file://{0}".format (os.path.join (output_dir, "csv"))
                annotated. \
                    map (to_csv_row). \
                    saveAsTextFile (csv_output)
                print (" --> csv: {0}".format (csv_output))

    ''' Concatenate all csvs into one big one '''
    csv_dirs = os.path.join (conf.output_dir, "eval", "annotated")
    print ("scanning {0}".format (csv_dirs))
    csv_files = []
    for root, dirnames, filenames in os.walk (csv_dirs):
        for filename in fnmatch.filter (filenames, '*part-*'):
            if "crc" not in filename and "csv" in root:
                file_name = os.path.join (root, filename)
                csv_files.append (file_name)
    big_csv = os.path.join (conf.output_dir, "eval", "eval.csv")
    with open (big_csv, "w") as stream:
        stream.write ("#pubmed_id,pubmed_date_unix_epoch_time,pubmed_date_human_readable,binary_a_term,binary_b_term,paragraph_distance,sentence_distance,word_distance,flag_if_valid,time_until_verified,freq_sec_deriv\n")
        for f in csv_files:
            with open (f, "r") as in_csv:
                for line in in_csv:
                    stream.write (line)
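
# Illustrative sketch only: BinaryEncoder is defined elsewhere in the project. The
# cls= argument passed to json.dumps above delegates non-serializable objects to a
# json.JSONEncoder subclass, typically along these lines (class and field names here
# are hypothetical):
def _example_encode_binary ():
    import json
    class ExampleBinary (object):
        def __init__ (self, L, R):
            self.L = L
            self.R = R
    class ExampleEncoder (json.JSONEncoder):
        def default (self, obj):
            # Fall back to the object's attribute dict; defer to the base class otherwise.
            return obj.__dict__ if hasattr (obj, "__dict__") else json.JSONEncoder.default (self, obj)
    return json.dumps (ExampleBinary ("MAPK1", "EGFR"), cls=ExampleEncoder)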
def get_article (article_path):
    article = EquivalentSet.get_article_equiv_set (SUtil.get_article (article_path))
    return [] if not article else [ article ]
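
# Illustrative sketch only: returning [] or [article] lets callers use flatMap to drop
# unreadable articles without a separate filter step, e.g. (hypothetical usage):
#
#   articles = sc.parallelize (article_paths, parts).flatMap (get_article)
#
# A plain-Python analogue of the same flattening:
def _example_flatten (article_paths):
    return [ article for path in article_paths for article in get_article (path) ]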