Example 1
 def make_binaries (article, L, R, threshold=8000):
     logger = LoggingUtil.init_logging (__file__)
     result = []
     for l in L:
         for r in R:
             distance = abs(l.docPos - r.docPos)
             logger.debug ("Distance - {0}".format (distance))
             if distance < threshold:
                 if article.date.find ("--") > -1:
                     article.date = "0-0-9000"
                 binary = KinaseBinary (id = 0,
                                        L = l.word,
                                        R = r.word,
                                        docDist  = abs(l.docPos - r.docPos),
                                        paraDist = abs(l.paraPos - r.paraPos),
                                        sentDist = abs(l.sentPos - r.sentPos),
                                        code = 0,
                                        fact = False,
                                        refs = [],
                                        pmid = article.id,
                                        date = SerUtil.parse_date (article.date),
                                        file_name = article.fileName)
                 logger.info ("Binary: {0}".format (binary))
                 result.append (binary)
     return result
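
A minimal usage sketch, assuming make_binaries and its chemotext_util dependencies (LoggingUtil, KinaseBinary, SerUtil) are importable; the stand-in classes and values below are hypothetical and only model the attributes the function actually reads:

from collections import namedtuple

# Hypothetical stand-ins for the objects make_binaries expects.
WordPos = namedtuple ("WordPos", "word docPos paraPos sentPos")
Article = namedtuple ("Article", "id date fileName")

L = [ WordPos ("imatinib", docPos=10,  paraPos=0, sentPos=1) ]
R = [ WordPos ("kinase",   docPos=250, paraPos=0, sentPos=2) ]
article = Article (id="12345", date="1-1-2001", fileName="12345.json")

# Every (l, r) pair closer than `threshold` document positions becomes a
# KinaseBinary; pairs farther apart are silently dropped.
binaries = make_binaries (article, L, R, threshold=8000)
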
Example 2
    def plot_before (before, output_dir):
        from chemotext_util import LoggingUtil
        logger = LoggingUtil.init_logging (__file__)

        if before.count () <= 0:
            return
        '''
        before = before.reduceByKey (lambda x, y : x + y). \
                 mapValues (lambda x : filter (lambda v : v is not None, x))
        '''
        true_plots_dir = os.path.join (output_dir, "chart")
        print ("-------------> before 0")
        true_mentions = before. \
                        mapValues (lambda x : Plot.plot_true_mentions (x, true_plots_dir)). \
                        filter (lambda x : len(x[1]) > 0)
        logger.info ("Plotted {0} sets of true mentions.".format (true_mentions.count ()))
        print ("-------------> before 1")

        false_plots_dir = os.path.join (output_dir, "chart", "false")
        false_mentions = before.subtractByKey (true_mentions)
        false_frequency = false_mentions. \
                          mapValues   (lambda x : Plot.false_mention_histogram (x, false_plots_dir))

        false_mentions = false_mentions. \
                         mapValues   (lambda x : Plot.plot_false_mentions (x, false_plots_dir))

        logger.info ("false mentions: {0}".format (false_mentions.count ()))
        true_mentions = None
        false_mentions = None
        false_frequency = None
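
The true/false split hinges on subtractByKey: any key in before whose values produced no true-mention plots is treated as a false mention. A minimal standalone sketch of that split, assuming a local SparkContext named sc:

before = sc.parallelize ([ ("a", [1, 2]), ("b", []), ("c", [3]) ])
true_mentions  = before.filter (lambda x : len (x[1]) > 0)
false_mentions = before.subtractByKey (true_mentions)
print (false_mentions.collect ())   # [('b', [])]
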
Example 3
 def lexer (article, terms):
     logger = LoggingUtil.init_logging (__file__)
     logger.debug ("Parsing @ {0}.json".format (article.fileName))
     result = []
     doc_pos = 0
     para_pos = 0
     sent_pos = 0
     for para in article.paragraphs:
         for sentence in para.sentences:
             sentence = sentence.replace (".", " ")
             for term in terms.value:
                 if not term or len(term) < 3:
                     continue
                 pos = sentence.find (term) if " " in term else sentence.find (" %s " % term)
                 if pos > -1:
                     result.append (WordPosition (word    = term,
                                                  docPos  = doc_pos + pos,
                                                  paraPos = para_pos,
                                                  sentPos = sent_pos))
             sent_pos = sent_pos + 1
             doc_pos = doc_pos + len (sentence)
         para_pos = para_pos + 1
     for r in result:
         logger.info ("word: {0} file: {1}".format (r, article.fileName))
     return result
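
A minimal sketch of the inputs lexer expects, assuming its chemotext_util dependencies are importable; the stand-in classes below are hypothetical: article needs fileName plus paragraphs of sentences, and terms is a Spark broadcast variable (any object exposing .value works for a local test):

from collections import namedtuple

Paragraph = namedtuple ("Paragraph", "sentences")
Article   = namedtuple ("Article", "fileName paragraphs")
Broadcast = namedtuple ("Broadcast", "value")   # stand-in for sc.broadcast (...)

article = Article (
    fileName   = "12345",
    paragraphs = [ Paragraph (sentences = [ " imatinib inhibits the kinase ." ]) ])
terms = Broadcast (value = [ "imatinib", "kinase" ])

# Single-word terms are only found when flanked by spaces (hence the leading
# space in the test sentence); multi-word terms are matched verbatim.
positions = lexer (article, terms)   # -> two WordPosition entries, one per matched term
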
Example 4
 def word2vec (conf):
     logger = LoggingUtil.init_logging (__file__)
     logger.info ("Creating Chemotext2 word embeddings from input: {0}".format (conf.input_dir))
     sc = SparkUtil.get_spark_context (conf.spark_conf)
     article_paths = SUtil.get_article_paths (conf.input_dir) #[:20000]
     articles = sc.parallelize (article_paths, conf.spark_conf.parts). \
                map (lambda p : SUtil.get_article (p))
     logger.info ("Listed {0} input files".format (articles.count ()))
     
     conf.output_dir = conf.output_dir.replace ("file:", "")
     conf.output_dir = "file://{0}/w2v".format (conf.output_dir)
     return WordEmbed (sc, conf.output_dir, articles)
Example 5
 def get_guesses (sc, input_dir, partitions, articles, slices=1, slice_n=1):
     from chemotext_util import LoggingUtil
     logger = LoggingUtil.init_logging (__file__)
     slice_size = int (len (articles) / slices)
     offset = slice_size * slice_n
     rest = len(articles) - offset
     if rest > slice_size and rest <= 2 * slice_size:
         slice_size = rest
     the_slice = articles [ offset : offset + slice_size ]
     logger.info ("   -- Guesses (input:{0}, articles:{1}, slice_size:{2}, offset:{3})".
                  format (input_dir, len(articles), slice_size, offset)) 
     articles = sc.parallelize (the_slice, partitions).  \
                flatMap (lambda p : EquivalentSet.get_article (p)).\
                sample (False, debug_scale).\
                cache ()
     return (
         articles.flatMap (Guesses.get_article_guesses).cache (),
         articles.map (lambda a : a.id).collect ()
     )
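
The slice arithmetic widens the final slice so trailing articles are not dropped when the article count is not an exact multiple of slices (debug_scale is presumably defined at module level in the original file). A standalone illustration of just that arithmetic:

articles = list (range (10))
slices = 3
slice_size = int (len (articles) / slices)   # 3
for slice_n in range (slices):
    offset = slice_size * slice_n
    rest = len (articles) - offset
    size = rest if rest > slice_size and rest <= 2 * slice_size else slice_size
    print (slice_n, articles [offset : offset + size])
# 0 [0, 1, 2]
# 1 [3, 4, 5]
# 2 [6, 7, 8, 9]   <- the last slice absorbs the remainder
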
Example 6
import random
import re
import shutil
import time
import traceback
from chemotext_util import SparkConf
from chemotext_util import SparkUtil
from chemotext_util import EvaluateConf
from chemotext_util import LoggingUtil
from datetime import date
from graphframes import GraphFrame
from rdflib import Graph
from pyspark.sql import Row
from pyspark.sql import SQLContext

logger = LoggingUtil.init_logging (__file__)

def trim_uri (u):
    result = u
    if u.startswith ("<http://"):
        s = u.replace('>', '').split ('/')
        n = len (s)
        result ='/'.join (s[n-2:n])
    return result
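
trim_uri keeps only the last two path segments of an angle-bracketed HTTP URI and returns anything else unchanged; for example (the URI below is a hypothetical chem2bio2rdf-style identifier):

trim_uri ("<http://chem2bio2rdf.org/drugbank/resource/drugbank_drug>")
# -> 'resource/drugbank_drug'
trim_uri ("plain literal")
# -> 'plain literal'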

def process_graphs (sc, in_dir, partitions):
    """
    Read graph vertices and edges from disk if already saved.
    Otherwise,
    Read chem2bio2rdf drugbank, pubchem, and other N3 RDF models.
    Save vertices and edges to disk.
Example 7
    def evaluate (conf):
        logger = LoggingUtil.init_logging (__file__)
        logger.info ("Evaluating Chemotext2 output: {0}".format (conf.input_dir))
        sc = SparkUtil.get_spark_context (conf.spark_conf)
        facts = Facts.get_facts (sc, conf.ctd_conf)
        pathway_facts = Facts.get_pathway_facts (sc, conf.ctd_conf)
        logger.info ("Loaded {0} facts".format (facts.count ()))
        articles = SUtil.get_article_paths (conf.input_dir) #[:200]
        logger.info ("Listed {0} input files".format (len(articles)))
        for slice_n in range (0, conf.slices):
            output_dir = os.path.join (conf.output_dir, "eval", "annotated", str(slice_n))
            if os.path.exists (output_dir):
                logger.info ("Skipping existing directory {0}".format (output_dir))
            else:
                logger.info ("Loading guesses")
                start = time.time ()
                guesses, article_pmids = Guesses.get_guesses (sc,
                                                              conf.input_dir,
                                                              conf.spark_conf.parts,
                                                              articles,
                                                              conf.slices,
                                                              slice_n)
                elapsed = round (time.time () - start, 2)
                count = guesses.count ()
                logger.info ("Guesses[slice {0}]. {1} binaries in {2} seconds.".format (slice_n, count, elapsed))
                
                pmids = sc.broadcast (article_pmids)

                start = time.time ()
                pmid_date_map = None
                pmid_map_path = os.path.join ( os.path.dirname (conf.input_dir), "pmid", "pmid_date_2.json")
                # /projects/stars/var/chemotext/pmid/pmid_date_2.json

                print ("Loading pmid date map: {0}".format (pmid_map_path))
                with open (pmid_map_path, "r") as stream:
                    pmid_date_map = json.loads (stream.read ())
                elapsed = round (time.time () - start, 2)
                print ("Read pmid date map in {0} seconds".format (elapsed))

                if pmid_date_map is None:
                    print ("Unable to load pmid date map")
                else:
                    start = time.time ()
                    pmid_date_map_broadcast = sc.broadcast (pmid_date_map)
                    annotated = Guesses.annotate (guesses, facts, pathway_facts, pmids, pmid_date_map_broadcast).cache ()
                    count = annotated.count ()
                    elapsed = round (time.time () - start, 2)
                
                logger.info ("Annotation[slice {0}]. {1} binaries in {2} seconds.".format (slice_n, count, elapsed))
                logger.info ("Generating annotated output for " + output_dir)
                os.makedirs (output_dir)


                train = annotated. \
                        filter (lambda b : b is not None and is_training (b))
                train.count ()

                train = train.map (lambda b : json.dumps (b, cls=BinaryEncoder))
                train.count ()
                train_out_dir = os.path.join (output_dir, 'train')
                train.saveAsTextFile ("file://" + train_out_dir)
                print ("   --> train: {0}".format (train_out_dir))
                
                test  = annotated. \
                        filter (lambda b : b is not None and not is_training (b)).\
                        map (lambda b : json.dumps (b, cls=BinaryEncoder))
                test_out_dir = os.path.join (output_dir, 'test')
                test.saveAsTextFile ("file://" + test_out_dir)
                print ("   --> test: {0}".format (test_out_dir))

                ''' Save CSV '''
                csv_output = "file://{0}".format (os.path.join (output_dir, "csv"))                
                annotated. \
                    map (to_csv_row). \
                    saveAsTextFile (csv_output)
                print ("   --> csv: {0}".format (csv_output))

        ''' Concatenate all csvs into one big one '''
        csv_dirs = os.path.join (conf.output_dir, "eval", "annotated")
        print ("scanning {0}".format (csv_dirs))
        csv_files = []
        for root, dirnames, filenames in os.walk (csv_dirs):
            for filename in fnmatch.filter(filenames, '*part-*'):
                if not "crc" in filename and "csv" in root:
                    file_name = os.path.join(root, filename)
                    csv_files.append (file_name)
        big_csv = os.path.join (conf.output_dir, "eval", "eval.csv")
        
        with open (big_csv, "w") as stream:
            stream.write ("#pubmed_id,pubmed_date_unix_epoch_time,pubmed_date_human_readable,binary_a_term,binary_b_term,paragraph_distance,sentence_distance,word_distance,flag_if_valid,time_until_verified,freq_sec_deriv\n")
            for f in csv_files:
                with open (f, "r") as in_csv:
                    for line in in_csv:
                        stream.write(line)
Example 8
 def trace_set (trace_level, label, rdd):
     logger = LoggingUtil.init_logging (__file__)
     if (logger.getEffectiveLevel() > trace_level):
         for g in rdd.collect ():
             print ("  {0}> {1}->{2}".format (label, g[0], g[1]))
Example 9
    def make_equiv_set (L, R, threshold=800):
        logger = LoggingUtil.init_logging (__file__)
        pairs = {}
        # Create all possible pairs
        for left in L:
            if left.word in skiplist:
                continue
            for right in R:
                if right.word in skiplist or right.word == left.word:
                    continue
                docDist = abs (left.docPos - right.docPos)
                if docDist < threshold:
                    key = "{0}@{1}".format (left.word, right.word)
                    binary = Binary (
                        id = 0,
                        L = left.word,
                        R = right.word,
                        docDist = docDist,
                        sentDist = abs ( left.sentPos - right.sentPos ),
                        paraDist = abs ( left.paraPos - right.paraPos ),
                        code = 1,
                        fact = False,
                        refs = [],
                        leftDocPos = left.docPos,
                        rightDocPos = right.docPos) 
                    if key in pairs:
                        pairs[key].append ( EqBinary (binary, left, right) )
                    else:
                        pairs[key] = [ EqBinary (binary, left, right) ]

        if log_trace:
            print ("")
            for k,v in pairs.iteritems ():
                for val in v: 
                    print ("  --: [{0}] -> [{1}]".format (k, val))

        # GroupBy (x,y)
        REk = []
        for key in pairs:
            REk_key = []
            if log_trace:
                print ("key: {0}".format (key))
            Ek = pairs[key]
            original_length = len (Ek)
            # Sort
            Ek.sort (key = lambda p: p.binary.docDist)
            while len(Ek) > 0:
                EquivalentSet.log_sorted (Ek)
                canonical = Ek[0]
                if log_trace:
                    print ("     2. canonical: {0}".format (canonical))
                # Min distance pair
                REk_key.append (canonical.binary)
                old = Ek
                Ek = [ e for e in Ek if not (e.L.docPos == canonical.L.docPos or 
                                             e.R.docPos == canonical.R.docPos) ]
                if log_trace:
                    for o in old:
                        if o not in Ek:
                            print "     3. discard {0}".format (o)
                EquivalentSet.log_reduced_set (REk_key, original_length, key)
            REk = REk + REk_key
        if log_trace:
            print ("REk: {0}".format (REk))
        return REk
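
The reduction loop is a greedy matching per (left word, right word) key: sort the candidate pairs by document distance, keep the closest as the canonical pair, drop every remaining pair that reuses the canonical's left or right position, and repeat until the bucket is empty. A stripped-down illustration on plain tuples (the real code works on EqBinary objects):

# Candidates for one key, as (left_doc_pos, right_doc_pos, doc_dist) tuples.
candidates = [ (10, 15, 5), (10, 40, 30), (35, 40, 5), (80, 90, 10) ]
kept = []
remaining = sorted (candidates, key = lambda c : c[2])
while remaining:
    canonical = remaining[0]
    kept.append (canonical)
    remaining = [ c for c in remaining
                  if not (c[0] == canonical[0] or c[1] == canonical[1]) ]
print (kept)   # [(10, 15, 5), (35, 40, 5), (80, 90, 10)]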