Example #1
 def word2vec (conf):
     """Build Chemotext2 word embeddings for every article under conf.input_dir."""
     logger = LoggingUtil.init_logging (__file__)
     logger.info ("Creating Chemotext2 word embeddings from input: {0}".format (conf.input_dir))
     sc = SparkUtil.get_spark_context (conf.spark_conf)
     article_paths = SUtil.get_article_paths (conf.input_dir) #[:20000]
     articles = sc.parallelize (article_paths, conf.spark_conf.parts). \
                map (lambda p : SUtil.get_article (p))
     logger.info ("Listed {0} input files".format (articles.count ()))
     
     # Normalize the output directory into a file:// URL with a w2v subdirectory.
     conf.output_dir = conf.output_dir.replace ("file:", "")
     conf.output_dir = "file://{0}/w2v".format (conf.output_dir)
     return WordEmbed (sc, conf.output_dir, articles)
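A sketch of how word2vec might be invoked. The configuration object below is a stand-in built with SimpleNamespace; the paths, the partition count, and whatever SparkUtil.get_spark_context actually expects of spark_conf are all assumptions, not the real Chemotext2 config.

from types import SimpleNamespace

# Hypothetical stand-in for the real Chemotext2 config object.
conf = SimpleNamespace (
    input_dir  = "/projects/stars/var/chemotext/articles",  # assumed article location
    output_dir = "/projects/stars/var/chemotext/out",       # "/w2v" is appended inside
    spark_conf = SimpleNamespace (parts = 128))             # partition count for parallelize
model = word2vec (conf)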
Example #2
 def make_binaries (article, L, R, threshold=8000):
     """Pair each left-hand term with each right-hand term that co-occurs within threshold positions of it."""
     logger = LoggingUtil.init_logging (__file__)
     result = []
     for l in L:
         for r in R:
             distance = abs(l.docPos - r.docPos)
             logger.debug ("Distance - {0}".format (distance))
             if distance < threshold:
                 if article.date.find ("--") > -1:
                     # Malformed date ("--"): replace with the 0-0-9000 sentinel.
                     article.date = "0-0-9000"
                 binary = KinaseBinary (id = 0,
                                        L = l.word,
                                        R = r.word,
                                        docDist  = distance,
                                        paraDist = abs(l.paraPos - r.paraPos),
                                        sentDist = abs(l.sentPos - r.sentPos),
                                        code = 0,
                                        fact = False,
                                        refs = [],
                                        pmid = article.id,
                                        date = SerUtil.parse_date (article.date),
                                        file_name = article.fileName)
                 logger.info ("Binary: {0}".format (binary))
                 result.append (binary)
     return result
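A minimal sketch of driving make_binaries with stand-in inputs. The SimpleNamespace records and every field value are assumptions; in the pipeline, L and R are word-occurrence records produced upstream.

from types import SimpleNamespace

article = SimpleNamespace (id = "12345", date = "1-1-2000", fileName = "12345.json")
L = [ SimpleNamespace (word = "imatinib", docPos = 10, paraPos = 1, sentPos = 2) ]
R = [ SimpleNamespace (word = "ABL1",     docPos = 90, paraPos = 1, sentPos = 3) ]
binaries = make_binaries (article, L, R)   # docDist 80 < 8000, so one KinaseBinary is produced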
Example #3
 def find_before (pmid_date, facts):
     """Return binaries whose article date precedes the date of the paper that verified them."""
     logger = LoggingUtil.init_logging (__file__)
     logger.info ("Join facts with the pmid->date map to find interactions noticed before the published discovery.")
     ref_pmid_to_binary = facts.map (lambda r : ( r[1][1].pmid, r[1][0] ) ) # ( intact.REF[pmid] -> KinaseBinary )
     # TEST. Add reference pmids with late dates.
     pmid_date = pmid_date.union (ref_pmid_to_binary.map (lambda r : ( r[0], SerUtil.parse_date ("1-1-2300") )))
     before = ref_pmid_to_binary.                                                \
              join (pmid_date).                                                  \
              map (lambda r : r[1][0].copy (ref_date = r[1][1]) ).               \
              filter (lambda k : k.date and k.ref_date and k.date < k.ref_date). \
              distinct ()
     return before
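The element shapes here are inferred from the lambdas rather than stated anywhere in the source, so read the summary below with that caveat:

# Inferred RDD element shapes (assumptions):
#   facts:              ( key, (KinaseBinary, intact_record) )  -- intact_record.pmid is the reference pmid
#   ref_pmid_to_binary: ( ref_pmid, KinaseBinary )
#   pmid_date:          ( pmid, date )
#   after join:         ( ref_pmid, (KinaseBinary, ref_date) )  -- ref_date copied onto the binary,
#                                                                  then filtered to date < ref_date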
Example #4
 def __iter__(self):
     """Stream one tokenized, synonym-canonicalized token list per sentence across all matching files."""
     for file_name in self.files:
         if self.match (file_name):
             base = "{0}.json".format (os.path.basename (file_name))
             article_path = os.path.join(self.input_dir, base)
             article = SUtil.get_article (article_path)
             if article is not None:
                 # http://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize
                 sentence_tokens = [ self.tokenizer.tokenize(s) for s in sent_tokenize (article.raw) ]
                 sentences = [ self.gene_syn.make_canonical (s) for s in sentence_tokens ]
                 # Alternative tokenizations kept for reference:
                 #sentences = [ self.tokenizer.tokenize(s) for p in article.paragraphs for s in p.sentences ]
                 #sentences = [ s.split(' ') for p in article.paragraphs for s in p.sentences ]
                 for s in sentences:
                     yield s
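A corpus class with __iter__ (rather than a one-shot generator) matters here because gensim's Word2Vec iterates over the corpus more than once: one pass to build the vocabulary, then one per training epoch. A usage sketch, assuming this method belongs to a corpus class here called SentenceCorpus (the name and constructor arguments are assumptions):

from gensim.models import Word2Vec

corpus = SentenceCorpus (...)            # hypothetical constructor; the object must be re-iterable
model = Word2Vec (corpus, workers = 4)   # gensim restarts iteration for each pass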
Example #5
    def get_article_guesses (article):
        guesses = article.AB + article.BC + article.AC + article.BB
        skiplist = [ 'for', 'was', 'she', 'long' ]   # common words that show up as spurious term matches
        result = []
        for g in guesses:
            if g.L not in skiplist and g.R not in skiplist:
                g.pmid = article.id
                try:
                    date = SUtil.parse_date (article.date)
                    if date:
                        g.date = calendar.timegm (date.timetuple())
#                        print ("Parsed guess date -> {0}".format (g.date))
                except Exception:
                    print ("No date parsed in {0} {1}".format (article.fileName, article.date))
                    traceback.print_exc ()
                result.append ( ( make_key (g.L, g.R, g.pmid), Guesses.distance (g) ) )
        return result
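calendar.timegm (date.timetuple ()) converts the parsed datetime to Unix epoch seconds in UTC (unlike time.mktime, which assumes local time). make_key and Guesses.distance are not shown, so the snippet below illustrates only the epoch conversion:

import calendar
from datetime import datetime

epoch = calendar.timegm (datetime (2000, 1, 1).timetuple ())
print (epoch)   # 946684800 -- seconds since 1970-01-01 UTC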
Example #6
    def evaluate (conf):
        """Annotate guessed binaries against CTD facts, then write train/test/CSV outputs per slice."""
        logger = LoggingUtil.init_logging (__file__)
        logger.info ("Evaluating Chemotext2 output: {0}".format (conf.input_dir))
        sc = SparkUtil.get_spark_context (conf.spark_conf)
        facts = Facts.get_facts (sc, conf.ctd_conf)
        pathway_facts = Facts.get_pathway_facts (sc, conf.ctd_conf)
        logger.info ("Loaded {0} facts".format (facts.count ()))
        articles = SUtil.get_article_paths (conf.input_dir) #[:200]
        logger.info ("Listed {0} input files".format (len(articles)))
        for slice_n in range (0, conf.slices):
            output_dir = os.path.join (conf.output_dir, "eval", "annotated", str(slice_n))
            if os.path.exists (output_dir):
                logger.info ("Skipping existing directory {0}".format (output_dir))
            else:
                logger.info ("Loading guesses")
                start = time.time ()
                guesses, article_pmids = Guesses.get_guesses (sc,
                                                              conf.input_dir,
                                                              conf.spark_conf.parts,
                                                              articles,
                                                              conf.slices,
                                                              slice_n)
                elapsed = round (time.time () - start, 2)
                count = guesses.count ()
                logger.info ("Guesses[slice {0}]. {1} binaries in {2} seconds.".format (slice_n, count, elapsed))
                
                pmids = sc.broadcast (article_pmids)

                start = time.time ()
                pmid_date_map = None
                pmid_map_path = os.path.join ( os.path.dirname (conf.input_dir), "pmid", "pmid_date_2.json")
                # /projects/stars/var/chemotext/pmid/pmid_date_2.json

                print ("Loading pmid date map: {0}".format (pmid_map_path))
                with open (pmid_map_path, "r") as stream:
                    pmid_date_map = json.loads (stream.read ())
                elapsed = round (time.time () - start, 2)
                print ("Read pmid date map in {0} seconds".format (elapsed))

                if pmid_date_map is None:
                    logger.error ("Unable to load pmid date map")
                    continue
                start = time.time ()
                pmid_date_map_broadcast = sc.broadcast (pmid_date_map)
                annotated = Guesses.annotate (guesses, facts, pathway_facts, pmids, pmid_date_map_broadcast).cache ()
                count = annotated.count ()
                elapsed = round (time.time () - start, 2)

                logger.info ("Annotation[slice {0}]. {1} binaries in {2} seconds.".format (slice_n, count, elapsed))
                logger.info ("Generating annotated output for " + output_dir)
                os.makedirs (output_dir)


                train = annotated. \
                        filter (lambda b : b is not None and is_training (b)). \
                        map (lambda b : json.dumps (b, cls=BinaryEncoder))
                train_out_dir = os.path.join (output_dir, 'train')
                train.saveAsTextFile ("file://" + train_out_dir)
                print ("   --> train: {0}".format (train_out_dir))
                
                test  = annotated. \
                        filter (lambda b : b is not None and not is_training (b)). \
                        map (lambda b : json.dumps (b, cls=BinaryEncoder))
                test_out_dir = os.path.join (output_dir, 'test')
                test.saveAsTextFile ("file://" + test_out_dir)
                print ("   --> test: {0}".format (test_out_dir))

                ''' Save CSV '''
                csv_output = "file://{0}".format (os.path.join (output_dir, "csv"))
                annotated. \
                    map (to_csv_row). \
                    saveAsTextFile (csv_output)
                print ("   --> csv: {0}".format (csv_output))

        ''' Concatenate all csvs into one big one '''
        csv_dirs = os.path.join (conf.output_dir, "eval", "annotated")
        print ("scanning {0}".format (csv_dirs))
        csv_files = []
        for root, dirnames, filenames in os.walk (csv_dirs):
            for filename in fnmatch.filter(filenames, '*part-*'):
                if not "crc" in filename and "csv" in root:
                    file_name = os.path.join(root, filename)
                    csv_files.append (file_name)
        big_csv = os.path.join (conf.output_dir, "eval", "eval.csv")
        
        with open (big_csv, "w") as stream:
            stream.write ("#pubmed_id,pubmed_date_unix_epoch_time,pubmed_date_human_readable,binary_a_term,binary_b_term,paragraph_distance,sentence_distance,word_distance,flag_if_valid,time_until_verified,freq_sec_deriv\n")
            for f in csv_files:
                with open (f, "r") as in_csv:
                    for line in in_csv:
                        stream.write(line)
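to_csv_row is not shown in this listing. Given the header written above, a plausible sketch would emit the same eleven columns in order; the field names below are assumptions layered on the KinaseBinary fields from Example #2, not the project's actual implementation:

def to_csv_row (b):
    # Hypothetical: date_human, time_until_verified, and freq_sec_deriv are assumed field names.
    return ",".join (map (str, [ b.pmid, b.date, b.date_human,
                                 b.L, b.R,
                                 b.paraDist, b.sentDist, b.docDist,
                                 b.fact, b.time_until_verified, b.freq_sec_deriv ]))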
Example #7
 def get_article (article_path):
     """Load an article's equivalent set as a single-element list (empty if loading failed)."""
     article = EquivalentSet.get_article_equiv_set (SUtil.get_article (article_path))
     return [] if not article else [ article ]
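Returning [] or [ article ] instead of None lets this function plug straight into flatMap, which silently drops unloadable articles without a separate filter step. A usage sketch, mirroring the RDD construction from Example #1:

paths = SUtil.get_article_paths (conf.input_dir)
articles = sc.parallelize (paths, conf.spark_conf.parts).flatMap (get_article)   # empty lists vanish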