Example #1
def write_shs_verbal_indeclinables(adverbs_path, final_path, root_converter,
                                   out_path):
    """Write SHS verbal indeclinables."""
    labels = None
    clean_rows = []
    with util.read_csv(adverbs_path) as reader:
        for row in reader:
            root_pair = root_converter.get(row['root'])
            if root_pair is None:
                continue
            row['root'] = root_pair[0]
            row['hom'] = root_pair[1]
            clean_rows.append(row)

    with util.read_csv(final_path) as reader:
        for row in reader:
            root_pair = root_converter.get(row['root'])
            if root_pair is None:
                continue

            row['root'] = root_pair[0]
            row['hom'] = root_pair[1]
            # TODO: handle 'ya' gerunds
            if not row['form'].endswith('um'):
                continue
            clean_rows.append(row)

        labels = reader.fieldnames
        labels.insert(labels.index('root') + 1, 'hom')

    with util.write_csv(out_path, labels) as write_row:
        for row in clean_rows:
            write_row(row)
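Note: the dictionary-style examples on this page (Example #1, Example #3, and several later ones) assume a project-specific util module whose read_csv and write_csv are context managers built on Python's csv module. That module is not shown here; the following is only a minimal sketch, under that assumption, of what such helpers might look like.

# Hypothetical sketch only -- the real util module is project-specific.
import contextlib
import csv


@contextlib.contextmanager
def read_csv(path):
    """Yield a csv.DictReader so callers can iterate rows and read fieldnames."""
    with open(path, newline='') as f:
        yield csv.DictReader(f)


@contextlib.contextmanager
def write_csv(path, fieldnames):
    """Yield a write_row(dict) callable; the header row is written up front."""
    with open(path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        yield writer.writerow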
Example #2
    def ncbi_idmapping_data(self):
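        # Download NCBI's gene2refseq, gene2ensembl and gene_refseq_uniprotkb_collab
        # dumps (if not already cached), restrict them to human/mouse/rat
        # (tax ids 9606, 10090, 10116), and merge them into a single
        # RefSeq/Ensembl/UniProt id-mapping table.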
        if not os.path.exists(path.join(SyncDB.DOWNLOAD_DIR(), "gene2refseq.gz")):
            urllib.urlretrieve("ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz", os.path.join(SyncDB.DOWNLOAD_DIR(), "gene2refseq.gz"))

        if not os.path.exists(path.join(SyncDB.DOWNLOAD_DIR(), "gene2ensembl.gz")):
            urllib.urlretrieve("ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2ensembl.gz", os.path.join(SyncDB.DOWNLOAD_DIR(), "gene2ensembl.gz"))

        if not os.path.exists(path.join(SyncDB.DOWNLOAD_DIR(), "gene_refseq_uniprotkb_collab.gz")):
            urllib.urlretrieve("ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_refseq_uniprotkb_collab.gz", os.path.join(SyncDB.DOWNLOAD_DIR(), "gene_refseq_uniprotkb_collab.gz"))
        
        gene2ref=util.read_csv(SyncDB.DOWNLOAD_DIR()+'/gene2refseq.gz', sep='\t', header=False, skiprows=1, names=['Tax_ID', 'GeneID', 'status','Refseq_RNA','RNA_nucleotide_gi','Refseq_Prot','protein_gi','genomic_nucleotide_accession.version','genomic_nucleotide_gi','start_position_on_the_genomic_accession','end_position_on_the_genomic_accession','orientation','assembly','mature_peptide_accession.version','mature_peptide_gi','Symbol'])[['Tax_ID', 'GeneID', 'Refseq_RNA','Refseq_Prot','Symbol', 'status']].query('status != "SUPPRESSED" and Refseq_RNA != "-" and Tax_ID == [9606,10090,10116]').drop_duplicates(); 

        gene2ref=gene2ref.iloc[np.where(gene2ref["GeneID"].notnull())[0]];
        gene2ref=gene2ref.iloc[np.where(gene2ref["Refseq_RNA"].notnull())[0]];
        gene2ref = gene2ref[gene2ref['Refseq_RNA'].str.contains("X")==False] #Remove computed Refseq
        ref2uniprot=util.read_csv(SyncDB.DOWNLOAD_DIR()+'/gene_refseq_uniprotkb_collab.gz', sep='\t', header=False, skiprows=1, names=['Refseq_Prot','Uniprot_ID']); 
        ref2uniprot=ref2uniprot.iloc[np.where(ref2uniprot["Refseq_Prot"].notnull())[0]].drop_duplicates(subset='Refseq_Prot', take_last=True).fillna('-')
        
        gene2ens=util.read_csv(SyncDB.DOWNLOAD_DIR()+'/gene2ensembl.gz', sep='\t', header=False, skiprows=1, names=['Tax_ID','GeneID','Ensembl_Gene','Refseq_RNA','Ensembl_Trans','Refseq_Prot','Ensembl_Prot']).query('Tax_ID == [9606,10090,10116]')[['Ensembl_Gene','Refseq_RNA','Ensembl_Trans','Ensembl_Prot']];
        gene2ens=gene2ens.iloc[np.where(gene2ens["Refseq_RNA"].notnull())[0]].drop_duplicates(subset='Refseq_RNA', take_last=True).fillna('-')
        
        
        f=lambda x: x.split('.')[0];
        gene2ref['Refseq_Prot']=gene2ref['Refseq_Prot'].map(f);
        gene2ref['Refseq_RNA']=gene2ref['Refseq_RNA'].map(f);
        gene2ens['Refseq_RNA']=gene2ens['Refseq_RNA'].map(f);        
        df = pd.merge(gene2ref, ref2uniprot, on='Refseq_Prot', how='left');
        df = pd.merge(df, gene2ens, on='Refseq_RNA', how='left');                     
        df["Source"] = "NCBI";
        df=df.fillna('-')
        print "NCBI ID mapping data processed..."
        return df;
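In the SyncDB-style examples such as the one above, util.read_csv instead behaves like pandas.read_csv and returns a DataFrame. A minimal sketch under that assumption (the keyword pass-through is a guess, not the project's actual code):

# Hypothetical sketch -- assumes util.read_csv is a thin wrapper over pandas.
import pandas as pd


def read_csv(filename, **kwargs):
    # sep, names, skiprows, etc. are passed straight through; pandas infers
    # gzip compression from the .gz extension of files such as gene2refseq.gz.
    return pd.read_csv(filename, **kwargs)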
Example #3
def write_mw_prefixed_roots(prefixed_roots, unprefixed_roots, prefix_groups,
                            sandhi_rules, out_path):
    """Parse the prefixes in a prefix root and write the parsed roots."""

    with util.read_csv(prefix_groups) as reader:
        prefix_groups = {x['group']: x['prefixes'] for x in reader}
    with util.read_csv(unprefixed_roots) as reader:
        root_set = {(x['root'], x['hom']) for x in reader}

    candidate_homs = [None] + [str(i) for i in range(1, 10)]
    sandhi = make_sandhi_object(sandhi_rules)

    rows = []
    for row in util.read_csv_rows(prefixed_roots):
        for group in sandhi.split_off(row['prefixed_root'],
                                      row['unprefixed_root']):
            if group in prefix_groups:
                basis, hom = row['unprefixed_root'], row['hom']
                if (basis, hom) not in root_set:
                    for x in candidate_homs:
                        if (basis, x) in root_set:
                            hom = x
                            break
                    if (basis, hom) not in root_set:
                        continue

                rows.append((row['prefixed_root'], prefix_groups[group],
                             row['unprefixed_root'], hom))
                break

    labels = ['prefixed_root', 'prefixes', 'unprefixed_root', 'hom']
    with util.write_csv(out_path, labels) as write_row:
        for row in rows:
            write_row(dict(zip(labels, row)))
Example #4
    def get_jax_annotations (self):    
        #urllib.urlretrieve("ftp://ftp.informatics.jax.org/pub/reports/HMD_HumanPhenotype.rpt", self.fn_dest_gene2phenotype)
        #urllib.urlretrieve("ftp://ftp.informatics.jax.org/pub/reports/VOC_MammalianPhenotype.rpt", self.fn_dest_reference)
        urllib.urlretrieve('http://www.informatics.jax.org/downloads/reports/HMD_HumanPhenotype.rpt', self.fn_dest_gene2phenotype)
        urllib.urlretrieve('http://www.informatics.jax.org/downloads/reports/VOC_MammalianPhenotype.rpt', self.fn_dest_reference)

        df_gene2phenotype = util.read_csv(self.fn_dest_gene2phenotype, names=['human_symbol', 'gid', 'homolo_gid', 'yes_no','mouse_marker', 'mgi_marker', 'phenotype_ids'], sep=r'\t', index_col=False);
        df_mgi_reference = util.read_csv(self.fn_dest_reference, names=['phenotype_id', 'name', 'description'], sep=r'\t', index_col=False);
        df_gene2phenotype['mgi_marker']= df_gene2phenotype['mgi_marker'].map(str.strip)
        df_mgi_reference['phenotype_id']= df_mgi_reference['phenotype_id'].map(str.strip)
        
        data = [];
        for index,r in df_gene2phenotype.iterrows():
            if r['phenotype_ids']:
                for pid in r['phenotype_ids'].split(' '):
                    data.append({'gid':r['gid'], 'phenotype_id':pid})
                
        df_gene2phenotype = pd.DataFrame(data); 
        df_join = pd.merge(df_gene2phenotype, df_mgi_reference, left_on='phenotype_id', right_on='phenotype_id', how='inner')
        
        data=[]
        for k,g in df_join.groupby('gid', as_index=False):
            data.append({'gid':k, 'content':'; '.join(g['name']), 'annotation_field1':'; '.join([x for x in g['description'] if x is not None]), 'tax_id':'9606'})
        
        pd.DataFrame(data).to_csv(self.fn_dest_jax_annotations, index=False);
Example #5
    def get_ensembl2gid_df_web(self, tax_id, type):
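        # Query the Ensembl BioMart web service chromosome by chromosome to map
        # the given identifier type to Entrez gene ids, concatenate the
        # per-chromosome files, and return the result as a DataFrame.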
        #mmusculus_gene_ensembl   10090
        #rnorvegicus_gene_ensembl 10116
        print "Get %s to gene id for %s" % (type, tax_id)
        attr = '<Attribute name = "' + type + '" />'
        db_name = self.get_dbname_by_taxid(tax_id)
        if db_name is None:
            return None

        fname = 'ensembl2gid_%s_%s' % (type, tax_id)

        util.unix('mkdir -p ' + SyncDB.DOWNLOAD_DIR() + '/ensembl_files')
        valid_files = []
        for chr in self.chrList:
            ensembl_file = SyncDB.DOWNLOAD_DIR(
            ) + "/ensembl_files/%s_chr%s" % (fname, chr)
            print "downloading %s from %s for chr %s..." % (type, db_name, chr)
            if not os.path.exists(ensembl_file):
                cmd = 'wget -O ' + ensembl_file + ' \'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0"\
                encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default"\
                formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "' + db_name + '" interface = "default"><Filter name="chromosome_name" value="' + chr + '" filter_list=""/>' + attr + '<Attribute name = "entrezgene"\
                /></Dataset></Query>\''

                util.unix(cmd)
            try:
                tdf = util.read_csv(ensembl_file,
                                    sep="\t",
                                    header=None,
                                    names=['source_id', 'gid'],
                                    nrows=1)
            except Exception as exp:
                tdf = pd.DataFrame()

            if len(tdf) != 0:
                valid_files.append(ensembl_file)

        if len(valid_files) == 0:
            return None

        cmd = 'cat %s >> %s' % (' '.join(valid_files), SyncDB.DOWNLOAD_DIR() +
                                "/ensembl_files/" + fname)
        print cmd
        util.unix(cmd)
        print "downloading %s from %s done." % (type, db_name)
        ensembl_data = util.read_csv(SyncDB.DOWNLOAD_DIR() +
                                     "/ensembl_files/" + fname,
                                     sep="\t",
                                     header=None,
                                     names=['source_id', 'gid'])
        ensembl_data = ensembl_data[ensembl_data['gid'].notnull()]
        ensembl_data = ensembl_data[ensembl_data['source_id'].notnull()]
        ensembl_data[['gid']] = ensembl_data[['gid']].astype(int)
        ensembl_data['tax_id'] = tax_id
        ensembl_data['type_name'] = type
        return ensembl_data
Example #6
def make_root_converter(shs_roots_path, shs_blacklist_path, shs_override_path,
                        mw_unprefixed_roots_path):
    """Returns a dict that maps SHS roots to MW roots.

    Specifically, the dict maps each SHS root string to a single (mw_root, hom) tuple.
    """
    with util.read_csv(shs_blacklist_path) as reader:
        blacklist = {x['name'] for x in reader}

    with util.read_csv(shs_override_path) as reader:
        override = {x['shs']: x['mw'] for x in reader}

    # (root, class) -> [shs_root]
    class_pair_to_shs_roots = {}
    with util.read_csv(shs_roots_path) as reader:
        for row in reader:
            shs_root = row['root']
            vclass = row['class']

            clean_root = shs_root
            if shs_root in blacklist:
                clean_root = None
            elif shs_root in override:
                clean_root = override[shs_root]

            if clean_root is None:
                continue
            clean_root = shs_root.partition('#')[0]
            class_pair_to_shs_roots.setdefault((clean_root, vclass),
                                               set()).add(shs_root)
    assert len(class_pair_to_shs_roots.keys()) > 0

    # (root, class) -> [(mw_root, hom)]
    class_pair_to_mw_roots = {}
    with util.read_csv(mw_unprefixed_roots_path) as reader:
        for row in reader:
            root, hom, vclass = row['root'], row['hom'], row['class']
            class_pair_to_mw_roots.setdefault((root, vclass), []).append(
                (root, hom))
    assert len(class_pair_to_mw_roots.keys()) > 0

    # shs_root -> (mw_root, hom)
    converter = {}
    for shs_pair in class_pair_to_shs_roots:
        if shs_pair not in class_pair_to_mw_roots:
            continue
        shs_roots = class_pair_to_shs_roots[shs_pair]
        for shs_root in shs_roots:
            for mw_root in class_pair_to_mw_roots[shs_pair]:
                converter[shs_root] = mw_root

    assert len(converter.keys()) > 0
    return converter
Example #7
def main(unused_argv):
    training_data, training_target = util.read_csv(TRAINING)
    testing_data, testing_target = util.read_csv(TESTING)

    training = util.DataSet(training_data, training_target)
    test = util.DataSet(testing_data, testing_target)

    x = tf.placeholder(tf.float32, [None, 9], name="x")
    y_ = tf.placeholder(tf.float32, [None, 1], name="y_")

    y_conv, keep_prob = deepnn(x)

    with tf.name_scope('loss'):
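        # Despite the variable name, this is a squared-error (regression) loss,
        # not a cross-entropy.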
        cross_entropy = tf.square(y_conv - y_)

    cross_entropy = tf.reduce_mean(cross_entropy)

    with tf.name_scope('adam_optimizer'):
        train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(tf.cast(y_conv, tf.int32, name="output"),
                                      tf.cast(y_, tf.int32, name="target"))
        correct_prediction = tf.cast(correct_prediction, tf.float32)
    accuracy = tf.reduce_mean(correct_prediction, name="predict_op")

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(50000):
            batch = training.next_batch(50)
            if i % 1000 == 0:
                training_accuracy = cross_entropy.eval(feed_dict={
                    x: batch[0],
                    y_: batch[1],
                    keep_prob: 1.0
                })
                print('step %d, loss %g' % (i, training_accuracy))
            train_step.run(feed_dict={
                x: batch[0],
                y_: batch[1],
                keep_prob: 0.5
            })

        print('test loss %g' % cross_entropy.eval(feed_dict={
            x: test._images,
            y_: test._labels,
            keep_prob: 1.0
        }))
        saver.save(sess, "model_1")
Example #8
def main(unused_argv):
    training_data, training_target = util.read_csv(TRAINING)
    testing_data, testing_target = util.read_csv(TESTING)

    training = util.DataSet(training_data, training_target)
    test = util.DataSet(testing_data, testing_target)

    x = tf.placeholder(tf.float32, [None, 4], name="x")
    y_ = tf.placeholder(tf.float32, [None, 3], name="y_")

    y_conv, keep_prob = deepnn(x)

    with tf.name_scope('loss'):
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=y_,
                                                                logits=y_conv)

    cross_entropy = tf.reduce_mean(cross_entropy)

    with tf.name_scope('adam_optimizer'):
        train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(tf.argmax(y_conv, 1, name="output"),
                                      tf.argmax(y_, 1, name="target"))
        correct_prediction = tf.cast(correct_prediction, tf.float32)
    accuracy = tf.reduce_mean(correct_prediction, name="predict_op")

    saver = tf.train.Saver()

    with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(20000):
            batch = training.next_batch(20)
            arr = convertLabels(batch[1])
            if i % 1000 == 0:
                training_accuracy = accuracy.eval(feed_dict={
                    x: batch[0],
                    y_: arr,
                    keep_prob: 1.0
                })
                print('step %d, training accuracy %g' % (i, training_accuracy))
            train_step.run(feed_dict={x: batch[0], y_: arr, keep_prob: 0.5})

        arr = convertLabels(test._labels)
        print('test accuracy %g' % accuracy.eval(feed_dict={
            x: test._images,
            y_: arr,
            keep_prob: 1.0
        }))
        saver.save(sess, "model_1")
Example #9
    def get_ensembl2gid_map_old():
        if hasattr(GPUtils, 'ensembl2gid_map'):
            return GPUtils.ensembl2gid_map
        hgnc_file = path.join(SyncDB.DOWNLOAD_DIR(), "hgnc_complete_set.txt")
        ensembl_file = path.join(SyncDB.DOWNLOAD_DIR(),
                                 "hsapiens_gene_ensembl__gene__main.txt.gz")

        if not os.path.exists(hgnc_file):
            urllib.urlretrieve(
                "ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt",
                hgnc_file)
        if not os.path.exists(ensembl_file):
            urllib.urlretrieve(GPUtils.get_ensembl_mart_url(), ensembl_file)

        hgnc_data = util.read_csv(hgnc_file, sep="\t")[[
            'hgnc_id', 'symbol', 'ensembl_gene_id', 'entrez_id'
        ]]
        #Tracer()();

        ensembl_data = util.read_csv(ensembl_file,
                                     header=None,
                                     skiprows=1,
                                     sep="\t")[[4, 5, 6]]
        hgnc_lookup = {}
        for i in hgnc_data.index:
            hgnc_lookup[hgnc_data.at[i, 'hgnc_id']] = hgnc_data.at[i,
                                                                   'entrez_id']

        out = []
        lookup = {}
        for i in ensembl_data.index:
            hgnc_id = ensembl_data.at[i, 4]
            l = hgnc_id.find("HGNC:")
            if l < 0:
                #print "HGNC: was not found in ", hgnc_id;
                continue
            hgnc_id = hgnc_id[l:-1]

            if hgnc_id in hgnc_lookup:
                lookup[ensembl_data.at[i, 6]] = hgnc_lookup[hgnc_id]
                out.append({
                    'ensembl_gene_id': ensembl_data.at[i, 6],
                    'gene_id': hgnc_lookup[hgnc_id]
                })

        pd.DataFrame(out).sort(['ensembl_gene_id'
                                ]).to_csv('ebi_ensembl_map.csv', index=False)
        #Tracer()()
        GPUtils.ensembl2gid_map = lookup
        return lookup
Example #10
    def get_variations(self):
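        # Pull clinically significant variants from the Ensembl variation
        # database (or a cached CSV), group them by gene symbol, and emit one
        # annotation record per gene id.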
        ensembl_file = SyncDB.DOWNLOAD_DIR(
        ) + "/ensembl_files/ensembl_variations.csv"
        print "Processing variations"

        if os.path.exists(ensembl_file):
            t = util.read_csv(ensembl_file)
        else:
            con = self.get_ensembl_connection(
                EnsemblDownload.get_ensembl_latest_version(
                    'homo_sapiens_variation_{0}_'.format(
                        EnsemblDownload.ENSEMBL_VERSION)))
            query = "select distinct pf.object_id as variation_name,p.description,v.clinical_significance,vg.gene_name, s.name as source_name from source s, phenotype_feature pf, phenotype p, variation v, variation_genename vg where pf.type ='Variation' and pf.phenotype_id = p.phenotype_id and v.name=pf.object_id and v.variation_id=vg.variation_id and v.source_id=s.source_id and v.clinical_significance in ('likely pathogenic','pathogenic','risk factor','association','drug response')"
            t = db.from_sql(con, query, params=[])
            t.to_csv(ensembl_file, index=False)

        map = GPUtils.get_sym2gid_map()["sym2gid"]
        data = []
        for gene, row in t.groupby(['gene_name']):
            if gene in map:
                #Tracer()()
                content = [
                    '[' + r[1]['variation_name'] + '] ' + r[1]['description'] +
                    '{' + r[1]['clinical_significance'] + '}(' +
                    r[1]['source_name'] + ')' for r in row.iterrows()
                ]
                data.append({
                    'gid': map[gene],
                    'content': ';'.join(content),
                    'annotation_field1': gene,
                    'type_name': 'VARIATIONS_ENSEMBL',
                    'tax_id': '9606'
                })

        return data
Example #11
    def get_annotation_martdb(self, tax_id, a_type, is_boolean=False):
        import math
        db_name = self.get_dbname_by_taxid(tax_id)
        if db_name is None:
            return None
        file = SyncDB.DOWNLOAD_DIR() + "/ensembl_files/biomart_%s_%s.csv" % (
            a_type, tax_id)
        print "Running query to get %s for %s from martdb" % (a_type, tax_id)

        query = self.get_annotation_mart_query(a_type, db_name)

        con = self.get_biomart_connection()
        try:
            if os.path.exists(file):
                df = util.read_csv(file)
            else:
                df = db.from_sql(con, query).drop_duplicates()
                df.to_csv(file, index=False)
        except:
            print "error in getting %s data for %s" % (a_type, tax_id)
            return None

        data = []
        #Tracer()()
        for k, grow in df.groupby(['gid']):
            # Tracer()()
            cnt = []
            for i in grow.index:
                v1 = grow.at[i, "term"]
                v2 = grow.at[i, "description"]
                try:
                    if type(v1) is str or not math.isnan(v1):
                        try:
                            if type(v2) is str or not math.isnan(v2):
                                cnt.append('[%s] %s' % (str(v1), str(v2)))
                            else:
                                cnt.append(str(v1))
                        except:
                            cnt.append(str(v1))
                except:
                    pass

            cnt = pd.unique(cnt)
            content = ''
            if is_boolean:
                if len(cnt) > 0:
                    content = "Yes"
            else:
                content = ';'.join(cnt)

            if content != '':
                data.append({
                    'gid': k,
                    'content': content,
                    'annotation_field1': grow.at[i, 'gene'],
                    'type_name': a_type,
                    'tax_id': tax_id
                })

        return data
Example #12
    def get_ensembl2gid_df_not_used(self, tax_id, type):
        print "Get %s to gene id for %s" % (type, tax_id)
        source_file_name = self.get_idmap_source_file(type, tax_id)
        if source_file_name is None:
            return None

        source_file_name = SyncDB.DOWNLOAD_DIR(
        ) + "/ensembl_files/" + source_file_name

        out_file = SyncDB.DOWNLOAD_DIR(
        ) + "/ensembl_files/" + 'ensembl2gid_%s_%s' % (type, tax_id)

        if not os.path.exists(source_file_name):
            urllib.urlretrieve(
                self.mart_ftp + "/" + self.get_idmap_source_file(type, tax_id),
                source_file_name)

        ensembl_data = util.read_csv(source_file_name,
                                     sep="\t",
                                     header=None,
                                     names=['source_id', 'gid'])
        ensembl_data = ensembl_data[ensembl_data['gid'].notnull()]
        ensembl_data = ensembl_data[ensembl_data['source_id'].notnull()]
        ensembl_data[['gid']] = ensembl_data[['gid']].astype(int)
        ensembl_data['tax_id'] = tax_id
        ensembl_data['type_name'] = type
        return ensembl_data
Example #13
    def ensembl_trans2gene_map_by_taxid(self, tax_id):     
        if tax_id == 9606:
            db_name="hsapiens_gene_ensembl";
            fname="ensembl_genes_human_trans2gid_map.csv";
        elif tax_id == 10090:
            db_name="mmusculus_gene_ensembl";
            fname="ensembl_genes_mouse_trans2gid_map.csv";
        elif tax_id == 10116:
            db_name="rnorvegicus_gene_ensembl";
            fname="ensembl_genes_rat_trans2gid_map.csv";

        util.unix('mkdir -p ' + SyncDB.DOWNLOAD_DIR() + '/ensembl_files');       
        ensembl_file = SyncDB.DOWNLOAD_DIR() + "/ensembl_files/%s"%fname; 
        
        if not os.path.exists(ensembl_file ):
            cmd = 'wget -O ' + ensembl_file + ' \'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0"\
            encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default"\
            formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "' + db_name + '" interface = "default">\
            <Attribute name = "ensembl_gene_id"/><Attribute name = "ensembl_transcript_id"/><Attribute name = "ensembl_peptide_id"/>\
            </Dataset></Query>\'';
            util.unix(cmd);
            
        
        df=util.read_csv(ensembl_file, sep='\t', names=['Ensembl_Gene','Ensembl_Trans','Ensembl_Prot']);
        df=df.iloc[np.where(df["Ensembl_Trans"].notnull())[0]]
        df=df.drop_duplicates();
        
        df["Tax_ID"]=tax_id;
        
        return df;
Example #14
    def create_term_file(self):
        kwargs = {}
        if 'oldCols' in self.options:
            kwargs['names'] = self.options['oldCols'].split(',')
        if 'read_csv' in self.options:
            for kv_str in self.options['read_csv'].split(','):
                kv = kv_str.split('=')
                kwargs[kv[0]] = kv[1]
                if kv[1] == 'None':
                    kwargs[kv[0]] = None
                if kv[0].lower() == 'skiprows':
                    kwargs[kv[0]] = int(kv[1])

        iter_csv = util.read_csv(self.fn_source,
                                 iterator=True,
                                 chunksize=self.get_chunksize(),
                                 dtype=str,
                                 **kwargs)
        term_id_col = 'term_id' if 'term_id' not in self.column_map else self.column_map[
            'term_id']
        term_ids = []
        for chunk in iter_csv:
            term_ids += util.unique(chunk[term_id_col])
        term_ids = util.unique(term_ids)

        with open(self.fn_dest, "w") as myfile:
            wr = csv.writer(myfile)
            wr.writerow(['term_id', 'term_name', 'term_type'])
            wr.writerows([[
                term_id, self.term_name if self.term_name else term_id,
                self.options['typeName']
            ] for term_id in term_ids])
Example #15
    def instantiate_table_as_pandas_dataframe(self, table_name, **kwargs):

        # TODO: should probably check that the cache exists.

        # commenting out until encoding issue fixed
        # if self.config['tables'][table_name]['cache']['type'] != 'flat_file':
        #     raise ValueError("Table '+table_name+' cannot be instantiated, because it's not cached as a flat_file")

        # Replace from here...
        # filename = self.config['provenance']['data_filepath'] + \
        #     self.config['tables'][table_name]['cache']['filename']

        # df = pd.read_csv(filename, **kwargs)

        # lnc_df = util.convert_pandas_dataframe_to_lncPandasDataFrame(
        #     df,
        #     self.config['tables'][table_name],
        #     parent_pipeline=self
        # )
        # ...to here.

        # Replacement code.
        # Try this soon...
        table_config = self.config["tables"][table_name]
        filename = self.config["provenance"]["data_filepath"] + table_config["cache"]["filename"]

        lnc_df = util.read_csv(filename, table_config, **kwargs)

        self.instantiated_tables[table_name] = lnc_df

        self.I = self.instantiated_tables

        # return df
        return lnc_df
Example #16
    def get_ensembl2gid_map_ensembl():
        if hasattr(GPUtils, 'ensembl2gid_map_ensembl'):
            return GPUtils.ensembl2gid_map_ensembl

        ensembl_file = path.join(SyncDB.DOWNLOAD_DIR(),
                                 "ensembl_genes_info.csv")

        if not os.path.exists(ensembl_file):
            cmd = 'wget -O ' + ensembl_file + ' - \'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0"\
            encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default"\
            formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "hsapiens_gene_ensembl" interface = "default"\
            ><Attribute name = "ensembl_gene_id"/><Attribute name = "entrezgene"/></Dataset></Query>\''

            util.unix(cmd)

        ensembl_data = util.read_csv(ensembl_file,
                                     sep="\t",
                                     header=None,
                                     names=['ensembl_gene_id', 'gene_id'])
        ensembl_data = ensembl_data[ensembl_data['gene_id'].notnull()]
        ensembl_data[['gene_id']] = ensembl_data[['gene_id']].astype(int)
        GPUtils.ensembl2gid_map_ensembl = {}
        for i in ensembl_data.index:
            GPUtils.ensembl2gid_map_ensembl[ensembl_data.at[
                i, 'ensembl_gene_id']] = GPUtils.ensembl2gid_map_ensembl.get(
                    ensembl_data.at[i, 'ensembl_gene_id']) or []
            GPUtils.ensembl2gid_map_ensembl[ensembl_data.at[
                i, 'ensembl_gene_id']].append(ensembl_data.at[i, 'gene_id'])
        #Tracer()()
        return GPUtils.ensembl2gid_map_ensembl
Example #17
 def populate_product(self):
     """
     Insert data into product table in the database
     """
     df = read_csv(self.file_name)
     for idx, row in df.iterrows():
         product_name = row['product_name']
         bar_code = row['code']
         url = row['url']
         nutrition_grade = row['nutrition_grade_fr']
         energy = row['energy']
         proteins = row['proteins']
         category_names = row['main_category']
         store_names = row['stores_y']
         new_product = Product(product_name=product_name,
                               bar_code=bar_code,
                               url=url,
                               nutrition_grade=nutrition_grade,
                               energy=energy,
                               proteins=proteins)
         new_category = sess.query(Category).filter(
             Category.category_name == category_names).first()
         new_store = sess.query(Store).filter(
             Store.store_name == store_names).first()
         if new_store is None:
             new_store = Store(store_name=store_names)
             sess.add(new_store)
         new_product.categorys.append(new_category)
         new_product.stores.append(new_store)
         sess.add(new_product)
          sess.commit()
      # close the session once, after all rows have been committed
      sess.close()
Example #18
    def get_ensembl2gid_map_ncbi():
        if hasattr(GPUtils, 'ensembl2gid_map_ncbi'):
            return GPUtils.ensembl2gid_map_ncbi

        if not os.path.exists(
                path.join(SyncDB.DOWNLOAD_DIR(), "gene2ensembl.gz")):
            urllib.urlretrieve(
                "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2ensembl.gz",
                os.path.join(SyncDB.DOWNLOAD_DIR(), "gene2ensembl.gz"))
        gene2ens = util.read_csv(
            os.path.join(SyncDB.DOWNLOAD_DIR(), 'gene2ensembl.gz'),
            header=None,
            skiprows=1,
            sep="\t",
            names=[
                "tax_id", "GeneID", "Ensembl_gene_identifier",
                "RNA_nucleotide_accession.version", "Ensembl_rna_identifier",
                "protein_accession.version", "Ensembl_protein_identifier"
            ]).query('tax_id in [9606]')[['GeneID', 'Ensembl_gene_identifier']]
        gene2ens = gene2ens.drop_duplicates()
        GPUtils.ensembl2gid_map_ncbi = {}
        for i in gene2ens.index:
            GPUtils.ensembl2gid_map_ncbi[gene2ens.at[
                i,
                'Ensembl_gene_identifier']] = GPUtils.ensembl2gid_map_ncbi.get(
                    gene2ens.at[i, 'Ensembl_gene_identifier']) or []
            GPUtils.ensembl2gid_map_ncbi[gene2ens.at[
                i, 'Ensembl_gene_identifier']].append(gene2ens.at[i, 'GeneID'])
        #Tracer()()
        return GPUtils.ensembl2gid_map_ncbi
Example #19
    def get_gene_pathway_map(self):
        print 'Getting GeneGo pathway data'
        #df = self.fetch("select distinct g17.ref as gid, m.MAPNAME as term_name, m.imid as term_id from genemaps gm, imagemap_table m, genedbs_17 g17, geneorgs go, genes g where gm.im=m.imid and gm.gene=g17.gene and go.gene = gm.gene and go.org=1 and g.geneid=go.gene");
        fn = self.dir + "/pathway.csv"
        if not os.path.exists(fn):
            df = self.fetch(
                "select distinct i.imid as term_id, i.imagename url, i.mapname term_name, d.ref as gid, orgs.taxonomyid as tax_id from pw_imagemap_shapes s, pw_imagemap_class c, imagemap_table i, gene_netw n, genedbs d, geneorgs o, orgs where s.id=c.shape_id and i.imid=s.im and n.id=c.object_id and d.gene=n.gene and o.gene=n.gene and i.publish=1 and o.org=orgs.orgid and orgs.taxonomyid in (%s) and d.db=17"
                % (','.join(self.taxidList)))
            #Tracer()()
            df['TERM_ID'] = 'gMAP' + df['TERM_ID'].map(str)
            df.rename2({
                "TERM_ID": "term_id",
                "TERM_NAME": "term_name",
                "GID": "gid",
                "URL": "term_field1",
                "TAX_ID": "tax_id"
            })
            df['type_name'] = 'GeneGo Pathway'
            df.to_csv(fn, index=False)
        else:
            df = util.read_csv(fn)

        self.pathway_gid2term = df

        df2 = pd.DataFrame(df.copy())[['term_id', 'term_name', 'type_name']]
        df2['description'] = df2['term_name']
        df2 = df2.drop_duplicates()
        self.pathway_terms = df2
        self.pathway_done = True
        print 'GeneGo pathway data captured'
Example #20
    def get_ucsc2gid_df(self, tax_id):
        #mmusculus_gene_ensembl   10090
        #rnorvegicus_gene_ensembl 10116
        db_name = self.get_dsname_by_taxid(tax_id)
        if db_name is None:
            return pd.DataFrame()

        util.unix('mkdir -p ' + SyncDB.DOWNLOAD_DIR() + '/ucsc_files')
        file = SyncDB.DOWNLOAD_DIR() + "/ucsc_files/ucscid2gid_%s" % tax_id

        if not os.path.exists(file):
            con = db.get_con('UCSC')
            try:
                df = db.from_sql(
                    con,
                    "select name as source_id, value as gid from %s.knownToLocusLink"
                    % db_name)
            except Exception as exp:
                return pd.DataFrame()
            df.to_csv(file, index=False)

        data = util.read_csv(file)
        data = data[data['gid'].notnull()]
        data = data[data['source_id'].notnull()]
        data[['gid']] = data[['gid']].astype(int)
        data['tax_id'] = tax_id
        return data
Example #21
def get_co_authors(paper_csv):
    papers = read_csv(paper_csv, [
        'first_name', 'last_name', 'keys', 'valid', 'pub_key', 'pub_title',
        'put_year', 'pub_authors'
    ])
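    # Note: the loop below also relies on an 'id' field per row, which this
    # read_csv helper is assumed to supply in addition to the listed columns.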
    papers_dic = {}
    for p in papers:
        a_id = int(p['id'])
        if a_id not in papers_dic:
            papers_dic[a_id] = {}
            papers_dic[a_id]['first_name'] = p['first_name']
            papers_dic[a_id]['last_name'] = p['last_name']
            papers_dic[a_id]['keys'] = set([p['keys']])
            papers_dic[a_id]['pubs'] = []
            papers_dic[a_id]['co-authors'] = {}

        a_dic = papers_dic[a_id]

        if p['valid']:
            pub = (p['pub_key'], p['pub_title'])
            a_dic['pubs'].append(pub)
            for co_a in p['pub_authors'].split(";"):
                if co_a not in a_dic['co-authors']:
                    a_dic['co-authors'][co_a] = [pub]
                else:
                    a_dic['co-authors'][co_a].append(pub)

    return papers_dic
Example #22
    def get_ensembl2gid_map_ebi():
        if hasattr(GPUtils, 'ensembl2gid_map_ebi'):
            return GPUtils.ensembl2gid_map_ebi

        hgnc_file = path.join(SyncDB.DOWNLOAD_DIR(), "hgnc_complete_set.txt")

        if not os.path.exists(hgnc_file):
            urllib.urlretrieve(
                "ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt",
                hgnc_file)

        hgnc_data = util.read_csv(hgnc_file, sep="\t")[[
            'hgnc_id', 'symbol', 'ensembl_gene_id', 'entrez_id'
        ]]
        #Tracer()();

        GPUtils.ensembl2gid_map_ebi = {}
        for i in hgnc_data.index:
            GPUtils.ensembl2gid_map_ebi[hgnc_data.at[
                i, 'ensembl_gene_id']] = GPUtils.ensembl2gid_map_ebi.get(
                    hgnc_data.at[i, 'ensembl_gene_id']) or []
            GPUtils.ensembl2gid_map_ebi[hgnc_data.at[
                i, 'ensembl_gene_id']].append(hgnc_data.at[i, 'entrez_id'])
        #Tracer()()

        return GPUtils.ensembl2gid_map_ebi
Example #23
 def preprocess(self):
     
      # Load the page-code table and the labels
     self.pcd, self.code2name = prep_pagecd(read_csv(cfg.pgcd))
     self.lbl = read_csv(cfg.label)   
     
     for date in self.date_ranges:
         print(f"### {date} 데이터 전처리 시작")
         merged_df = self._preprocess_per_mth(date)
         
         csvpath = os.path.join('s3://', cfg.data_dir, seqfname(date.year, date.month))
         merged_df.to_csv(csvpath)
          print(f'### {date} preprocessing finished')
         print()
         
     return merged_df
Example #24
def main():
    # Reads from the data file and runs estimate for each row
    # Then plots the trajectory
    data_array = util.read_csv(config.DATASET_ABSOLUTE_PATH)

    row = data_array[0]
    time, encoder, angular_velocity, steering_angle = np.ravel(row)
    resulting_pos_heading = []
    pose_estimator = PoseEstimator((0, 0, 0), time, encoder, angular_velocity,
                                   steering_angle)
    i = 1
    while i < len(data_array):
        row = data_array[i]
        time, encoder, angular_velocity, steering_angle = np.ravel(row)
        x, y, heading = pose_estimator.estimate(
            time=time,
            steering_angle=steering_angle,
            encoder_ticks=encoder,
            angular_velocity=angular_velocity)
        resulting_pos_heading.append([x, y, heading])
        i = i + 1
    visualizer.plot_points(
        np.asarray(resulting_pos_heading)[:, 0],
        np.asarray(resulting_pos_heading)[:, 1])
    visualizer.show()
Example #25
def write_shs_verbal_data(data_path, root_converter, out_path):
    """Write Sanskrit Heritage Site data after converting its roots.

    :param data_path: path to the actual verb data
    :param root_converter: dict mapping each SHS root to an (MW root, hom) pair,
                           as produced by make_root_converter. Rows whose root
                           is not in the map are skipped.
    :param out_path: path to the output CSV
    """
    labels = None
    clean_rows = []
    with util.read_csv(data_path) as reader:
        for row in reader:
            root_pair = root_converter.get(row['root'])
            if root_pair is None:
                continue
            root, hom = root_pair
            row['root'] = root
            row['hom'] = hom
            clean_rows.append(row)
        labels = reader.fieldnames
        labels.insert(labels.index('root') + 1, 'hom')

    with util.write_csv(out_path, labels) as write_row:
        for row in clean_rows:
            write_row(row)
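The root_converter argument here is the dict produced by make_root_converter, shown earlier on this page. A hypothetical call sequence, with placeholder file names:

# Hypothetical wiring of the two helpers; all file names are placeholders.
converter = make_root_converter('shs-roots.csv', 'shs-blacklist.csv',
                                'shs-override.csv', 'mw-unprefixed-roots.csv')
write_shs_verbal_data('shs-verb-data.csv', converter, 'all-verbs.csv')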
Example #26
def main():
    # Runs pipeline
    # accuracy_vs_binning(False)

    # Read in data from csv
    X, y = util.read_csv(FILE,
                         normalize=True,
                         mean_center=True,
                         do_bin=False,
                         bin_step=25)

    #plot_labels(y)

    # plot label frequency
    # plot_labels(y)

    # plot correlation
    # plot_correlation(X, y)

    # Partition data into train and test datasets
    X_train, y_train, X_test, y_test = partition(X, y)

    print(np.std(y_test))
    # Uncomment below to test Random Forest
    # run_pipeline_rf(X_train, y_train, X_test, y_test)

    #Uncomment below to test sklearn FC
    run_pipeline_mlp(X_train, y_train, X_test, y_test)
Example #27
    def do_post_update(self):
        # The synonyms are concatenated with '|' for each gene. This post_update
        # creates a separate record for each synonym.
        import re

        df = util.read_csv(self.fn_dest, dtype='S300')
        h = df.header()
        gid_col_name = h[0]
        synonyms_col_name = h[1]
        tax_id_col_name = h[2]
        rows = []
        for i, r in df.iterrows():
            for x in str(r[synonyms_col_name]).split('|'):
                m = re.search('([^:]*):(.*)',x)
                if m and len(m.groups())==2:
                    db = m.group(1)
                    xref = m.group(2)
                    tax_id  = r[tax_id_col_name]
                    if db in ['MGI', 'WormBase','SGD','FLYBASE','ZFIN','RGD','Araport']:
                        rows.append({
                            gid_col_name: r[gid_col_name],
                            synonyms_col_name:xref,
                            tax_id_col_name:tax_id,
                            'id_status':db
                           })
        df = pd.DataFrame(rows)
        df.to_csv(self.fn_dest, index=False)
Example #28
 def find_by_symbols(self, path):
     """
     Helps finding subreddits for coins which cannot be found with the find_subreddits command.
     path: Path to file with a list of coin SYMBOLS (i.e. BTC, ETH, ...)
     """
     a = util.read_csv(path)
     symbols = [s[0] for s in a]
     known_coin_name_array = util.read_subs_from_file(settings.general["subreddit_file"])
     not_found, found = util.known_subs_for_symbols(known_coin_name_array, symbols)
     cap = CoinCap()
     coins = cap.get_coin_aliases(1000)
     coin_name_array = []
     # try finding remaining coins
     for coin in coins:
         if coin[-1] in not_found:
             coin_name_array.append(coin)
     if(len(symbols) != len(coin_name_array)):
         log.info("No coin data for {} coins.".format(len(symbols) - len(coin_name_array)))
     coin_name_array = sorted(coin_name_array, key=lambda c: c[-1])
     for coin_tuple in coin_name_array:
         coin_tuple.append("".join(x for x in coin_tuple[0] if x.isalnum()))
     subreddit_list = self.find_subreddits([name[-1] for name in coin_name_array])
     for i,coin in enumerate(coin_name_array):
         coin[-1] = subreddit_list[i]
     return coin_name_array, found
Example #29
    def get_rna_expression(self):
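        # Download the Protein Atlas RNA expression table, drop cell-line and
        # "Not detected" samples, convert Ensembl gene identifiers to gene ids,
        # and aggregate the expression strings per gene id.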
        self.get_ensembl2gid_map=GPUtils.get_ensembl2gid_map();  
        #"Gene","Tissue","Cell type","Level","Expression type","Reliability"        
        data_file = os.path.join(SyncDB.DOWNLOAD_DIR(),"protein_atlas/rna.csv.zip")
        urllib.urlretrieve("http://www.proteinatlas.org/download/rna.csv.zip", data_file)
            
        data = util.read_csv(data_file)
        allgenes = set();
        notfoundgenes =  set();

        out = []
        for index, row in data.iterrows():
            if row["Abundance"] == "Not detected":
                continue;
            allgenes.add(row['Gene'])
            if row['Gene'] in self.get_ensembl2gid_map:                
                if self.is_cellline(row["Sample"]):
                    continue;
                    
                content = None;                    
                content = row["Sample"] + "(" + row["Abundance"] + "|" + row["Unit"] + ":" + str(row["Value"]) + ")"
                for gid in self.get_ensembl2gid_map[row['Gene']]:
                    out.append({'gid':gid, 'content':content, 'annotation_field1':row['Gene']})                                                
            else:
                #print row['Gene']
                notfoundgenes.add(row['Gene'])
        
        print len(notfoundgenes), '/', len(allgenes), ' gene symbols (', float(len(notfoundgenes))/len(allgenes) if len(allgenes) != 0 else 1, ') cannot be converted to gene ids in ' + data_file;

        data=[]
           
        for k, g in pd.DataFrame(out).groupby(['gid']):   
            data.append({'gid':k, 'content':";".join(g['content'].tolist()), 'type_name': 'Protein_Atlas_RNA', 'annotation_field1':g['annotation_field1'].tolist()[0]})
                
        return data;
Example #30
 def get_omim_term(self):
      # This file needs an account; it is copied manually for now.
     #urllib.urlretrieve("ftp://*****:*****@ftp.omim.org/OMIM/genemap2.txt", self.fn_dest_omim_term + '.tmp' )
     df_omim = util.read_csv(
         os.path.join(SyncDB.DOWNLOAD_DIR(), "geneMap2.txt"),
         sep=r'\t',
         comment='#',
         names=[
             'Chromosome', 'Genomic Position Start', 'Genomic Position End',
             'Cyto Location', 'Computed Cyto Location', 'Mim Number',
             'Gene Symbols', 'Gene Name', 'Approved Symbol',
             'Entrez Gene ID', 'Ensembl Gene ID', 'Comments', 'Phenotypes',
             'Mouse Gene Symbol/ID'
         ])
     util.rename2(
         df_omim, {
             'Mim Number': 'mim_num',
             'Gene Name': 'title',
             'Phenotypes': 'disorders',
             'Comments': 'comments'
         })
     # df_omim = df_omim[df_omim['Mim Number']==155600]
     # pprint(df_omim[:1].to_dict())
     df_omim[['mim_num', 'title', 'disorders',
              'comments']].to_csv(self.fn_dest_omim_term,
                                  sep=',',
                                  index=False)
Example #31
    def make_gid2taxid(self):
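        # Extract a gene id -> tax id table from NCBI's gene_info.gz (restricted
        # to the supported species) with zcat/awk, then load it into a dict.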
        taxidList = SyncDB.SUPPORTED_SPECIES
        for child in self.children:
            if type(child).__name__ == "Species":
                taxidList = child.supported_species
                break
        if not os.path.exists(
                os.path.join(SyncDB.DOWNLOAD_DIR(), "geneid2taxid.csv")):
            taxid_filter = ""
            if len(taxidList) != 0:
                taxid_filter = ['$1==\"' + t + '\"' for t in taxidList]
                taxid_filter = "if (" + "||".join(taxid_filter) + ")"
            # gene_id,tax_id
            cmd = "time zcat " + SyncDB.DOWNLOAD_DIR(
            ) + "/gene_info.gz | cut -f1,2 | sed 1d | awk 'BEGIN{FS=\"\\t\"; OFS=\"\\t\"}{" + taxid_filter + " print $2,$1;}' | sort -k1,1 -t $'\\t' >" + SyncDB.DOWNLOAD_DIR(
            ) + "/geneid2taxid.csv"
            print cmd
            util.unix(cmd)

        df = util.read_csv(SyncDB.DOWNLOAD_DIR() + "/geneid2taxid.csv",
                           names=["gid", "tax_id"],
                           sep=r'\t')
        self.gid2taxid = {
            str(df.ix[i, 'gid']): str(df.ix[i, 'tax_id'])
            for i in df.index
        }
Example #32
def write_verb_prefixes(upasargas, other, out_path):
    with util.read_csv(upasargas) as reader:
        upasargas = list(reader)

    with util.read_csv(other) as reader:
        other = list(reader)
        labels = reader.fieldnames

    assert 'prefix_type' in labels
    for x in upasargas:
        assert 'prefix_type' not in x
        x['prefix_type'] = 'upasarga'

    rows = sorted(upasargas + other, key=lambda x: util.key_fn(x['name']))
    with util.write_csv(out_path, labels) as write_row:
        for row in rows:
            write_row(row)
Example #33
def write_prefixed_shs_verbal_data(data_path, prefixed_roots, root_converter,
                                   sandhi_rules, out_path):
    """Write Sanskrit Heritage Site data after converting its roots.

    :param data_path: path to the actual verb data
    :param out_path:
    """
    sandhi = make_sandhi_object(sandhi_rules)

    root_to_prefixed = {}
    with util.read_csv(prefixed_roots) as reader:
        for row in reader:
            root_to_prefixed.setdefault(row['unprefixed_root'], []).append(row)

    labels = None
    clean_rows = []
    with util.read_csv(data_path) as reader:
        for row in reader:
            root_pair = root_converter.get(row['root'])
            if root_pair is None:
                continue
            root, hom = root_pair

            for result in root_to_prefixed.get(root, []):
                new_row = row.copy()
                for field in ['form', 'stem']:
                    if field in row:
                        new_row[field] = sandhi.join(
                            result['prefixes'].split('-') + [new_row[field]])
                new_row['root'] = result['prefixed_root']
                new_row['hom'] = hom
                clean_rows.append(new_row)
        labels = reader.fieldnames + ['hom']

    old_rows = list(util.read_csv_rows(out_path))
    clean_rows.sort(key=lambda x: util.key_fn(x['root']))
    with util.write_csv(out_path, labels) as write_row:
        for row in old_rows:
            write_row(row)
        for row in clean_rows:
            write_row(row)
Example #34
def main():
    with open("data/owner.csv", 'rb') as o:
        owner_table = list(unicodecsv.reader(o))
    with open("data/disputed.csv", 'rb') as d:
        disputed_table = list(unicodecsv.reader(d))

    dates = [row[0] for row in owner_table[1:]]
    codes = sorted(list(set.union(set(owner_table[0][1:]), set(disputed_table[0][1:]))))
    
    table = [[""] + codes] + [[date] + ["" for code in codes] for date in dates]

    owner = util.read_csv("data/owner.csv")
    disputed = util.read_csv("data/disputed.csv")

    #Tracer()()

    for di, date in enumerate(dates):
        for ci, code in enumerate(codes):
            o = find(owner, date, code)
            d = find(disputed, date, code)
            if o and d:
                if o == "-":
                    table[di+1][ci+1] = d
                    print d
                elif d == "-":
                    table[di+1][ci+1] = o
                    print o
                else:
                    raise Exception("{} - {}: {} vs. {}".format(date, code, o, d))
            elif o:
                table[di+1][ci+1] = o
                print o
            elif d:
                table[di+1][ci+1] = d
                print d

    with open("data/description.csv", 'wb') as csvfile:
        writer = unicodecsv.writer(csvfile, delimiter=",")
        for row in table:
            writer.writerow(row)
Example #35
def write_prefixed_shs_verbal_indeclinables(final_path, sandhi_rules,
        prefixed_roots, root_converter, out_path):
    """Write prefixed SHS verbal indeclinables."""
    sandhi = make_sandhi_object(sandhi_rules)

    root_to_prefixed = {}
    with util.read_csv(prefixed_roots) as reader:
        for row in reader:
            root_to_prefixed.setdefault(row['unprefixed_root'], []).append(row)

    labels = None
    clean_rows = []
    with util.read_csv(final_path) as reader:
        for row in reader:
            root_pair = root_converter.get(row['root'])
            if root_pair is None:
                continue
            root, hom = root_pair

            row['root'] = root
            for result in root_to_prefixed.get(root, []):
                new_row = row.copy()
                for field in ['form', 'stem']:
                    if field in row:
                        new_row[field] = sandhi.join(
                            result['prefixes'].split('-') + [new_row[field]])
                new_row['root'] = result['prefixed_root']
                new_row['hom'] = result['hom']
                clean_rows.append(new_row)

        labels = reader.fieldnames

    labels += ['hom']
    old_rows = list(util.read_csv_rows(out_path))
    clean_rows.sort(key=lambda x: util.key_fn(x['root']))
    with util.write_csv(out_path, labels) as write_row:
        for row in old_rows:
            write_row(row)
        for row in clean_rows:
            write_row(row)
Ejemplo n.º 40
0
def awssqs():
    title = u'AWS SQS'
    if request.method == "POST":
        f = request.files.get('file')
        if f and allowed_file(f.filename):
            sendby = request.form.get('sendby')
            if sendby:
                sqs.add(sendby, read_csv(f))
                flash(u'丟到 AWS SQS {0}'.format(sendby))  # "Sent to AWS SQS {queue}"
            else:
                flash(u'錯誤選擇!')  # "Invalid choice!"

        return redirect(url_for('awssqs'))
    else:
        return make_response(render_template('t_awssqs.htm', title=title,
                                             qlist=QUEUE_NAME_LIST, awssqs=1))
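
Note that the read_csv used here is handed the uploaded file object directly, so it is not the util.read_csv of the other examples. A minimal sketch of such a helper, assuming the upload is a UTF-8 CSV and that sqs.add accepts a plain list of rows (neither assumption is confirmed by this snippet):

import csv

def read_csv(fileobj):
    # Sketch only: decode the uploaded file and return its rows as lists.
    text = fileobj.read().decode('utf-8')
    return list(csv.reader(text.splitlines()))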
Ejemplo n.º 41
0
def send_all_first():
    title = u'Send All People First'
    if request.method == "POST":
        f = request.files.get('file')
        if f and allowed_file(f.filename):
            if request.form.get('sendby') == 'sqs':
                sqs.add(QUEUE_NAME_SENDFIRST, read_csv(f))
                flash(u'丟到 AWS SQS')  # "Sent to AWS SQS"

            elif request.form.get('sendby') == 'mail':
                #t.template = t.env.get_template('./coscup_first.htm')
                #t.sendall(read_csv(f), t.send_first)
                flash(u'寄送大量登錄信')  # "Sending bulk registration mails"

            else:
                flash(u'錯誤選擇!')  # "Invalid choice!"

        return redirect(url_for('send_all_first'))
    else:
        return make_response(render_template('t_sendallfirst.htm',
                                             title=title, send_all_first=1))
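
Both upload views also call an allowed_file check that is not shown. It is presumably the usual Flask extension whitelist; a minimal sketch, assuming only .csv uploads are accepted (the real whitelist may differ):

ALLOWED_EXTENSIONS = {'csv'}

def allowed_file(filename):
    # Accept only filenames whose extension is in the whitelist.
    return ('.' in filename and
            filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS)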
Ejemplo n.º 42
0
def get_changes(tag):
    source = util.read_csv('data/' + tag + '.csv')
    original_sources[tag] = source[NOW]
    del source[NOW]
    changes_map[tag] = source
Ejemplo n.º 43
0
import sklearn.cross_validation
import sklearn.linear_model
import sklearn.ensemble
import numpy as np
import util

headers, data = util.read_csv("mpg_data.csv")

# the first column of the data is the MPG, 
# which we want to use as our label
data = np.asarray(data).astype(float)

labels = data[:,0]
features = data[:,1:]


linear_reg = sklearn.linear_model.LinearRegression()

# cv=3 for three fold cross validation
scores = sklearn.cross_validation.cross_val_score(
    linear_reg, features, labels,
    cv=3, scoring="mean_absolute_error")

print(scores.mean()) 
# sklearn switches the sign on MAE so that
# larger (i.e. less negative) numbers are better

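This snippet targets the old sklearn.cross_validation module and the plain "mean_absolute_error" scorer name, both of which were later removed. Roughly the same experiment against the current scikit-learn API, assuming the same mpg_data.csv layout, would look like this (a sketch, not part of the original):

import numpy as np
import sklearn.linear_model
from sklearn.model_selection import cross_val_score

import util

headers, data = util.read_csv("mpg_data.csv")
data = np.asarray(data).astype(float)

labels = data[:, 0]       # first column: MPG
features = data[:, 1:]

linear_reg = sklearn.linear_model.LinearRegression()

# Scorer names are now prefixed with "neg_" to make the sign flip explicit.
scores = cross_val_score(linear_reg, features, labels,
                         cv=3, scoring="neg_mean_absolute_error")
print(scores.mean())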
Ejemplo n.º 44
0
from sklearn.cross_validation import cross_val_score
import sklearn.linear_model
import sklearn.tree
import sklearn.ensemble
import numpy as np
from timeit import default_timer as timer
import util

headers, data = util.read_csv("hw_data.csv")

# the last column of the data is the 
# failure status, which we want to use as our label
data = np.asarray(data)

labels = data[:,-1]
features = data[:,:-1].astype(float)


# basic learners
clf1 = sklearn.linear_model.LogisticRegression() 
# even though it says regression, 
# this is for classification!

clf2 = sklearn.tree.DecisionTreeClassifier()

# bagging ensemble methods
clf3 = sklearn.ensemble.BaggingClassifier(
    sklearn.linear_model.LogisticRegression()
)

clf4 = sklearn.ensemble.BaggingClassifier(
    sklearn.tree.DecisionTreeClassifier()  # assumed base estimator; the original snippet is truncated here
)
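
The example breaks off before any of the classifiers are scored. Given the cross_val_score and timer imports at the top, it was presumably heading toward a timed cross-validation comparison; a possible continuation, under the same assumption about clf4 as above:

for name, clf in [("logistic", clf1), ("tree", clf2),
                  ("bagged logistic", clf3), ("bagged tree", clf4)]:
    start = timer()
    scores = cross_val_score(clf, features, labels, cv=3, scoring="accuracy")
    elapsed = timer() - start
    print("{0}: mean accuracy {1:.3f} ({2:.1f}s)".format(
        name, scores.mean(), elapsed))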
Ejemplo n.º 45
0
def write_prefix_groups(prefixed_roots, unprefixed_roots, upasargas, other,
                        sandhi_rules, out_path):
    """Parse the prefixes in a prefix root and write out the prefix groups.

    The procedure is roughly as follows:

        for each prefixed root in `prefixed_roots`:
            find (p_1, ..., p_n, r), where p_x is a prefix and r is a root
            write the prefix group (p_1, ..., p_n) to file.

    We find (p_1, .., p_n) by using the rules in `sandhi_rules` and verify
    that `p_x` is a prefix by checking for membership in `upasargas` and
    `other`.
    """

    # Loading prefixes
    all_prefixes = set()
    with util.read_csv(upasargas) as reader:
        all_prefixes.update([x['name'] for x in reader])
    with util.read_csv(other) as reader:
        all_prefixes.update([x['name'] for x in reader])

    # The 's' prefix is used in roots like 'saMskf' and 'parizkf'. Although it
    # is prefixed to a verb, it is not semantically the same as the other verb
    # prefixes. Here, though, we treat it as a verb prefix.
    all_prefixes.add('s')

    # Some prefixes have alternate forms.
    prefix_alternates = {
        'pi': 'api',
        'ut': 'ud',
        'Ri': 'ni',
        'niz': 'nis',
        'iz': 'nis',
        'palA': 'parA',
        'pali': 'pari',
        'z': 's',
    }
    all_prefixes.update(prefix_alternates.keys())

    # Loading sandhi rules
    sandhi = make_sandhi_object(sandhi_rules)

    with util.read_csv(prefixed_roots) as reader:
        rows = []
        for row in reader:
            # Nibble away at `prefixed_root` until we have all prefixes for the
            # given root.
            prefixes = []
            prefixed_root = row['prefixed_root']
            unprefixed_root = row['unprefixed_root']
            last_letter = None

            q = Queue.PriorityQueue()
            for remainder in sandhi.split_off(prefixed_root, unprefixed_root):
                q.put_nowait((0, (), remainder))

            while not q.empty():
                _, cur_prefixes, remainder = q.get_nowait()

                # `remainder` is something we recognize: we're done!
                if remainder in all_prefixes:
                    prefixes = list(cur_prefixes)
                    if remainder:
                        prefixes.append(remainder)
                        last_letter = remainder[-1]
                    break

                for before, after in sandhi.splits(remainder):
                    # Prevent recursion. As of this comment, the `splits` method
                    # returns the non-split of some term X as (X, ''). In other
                    # words, this conditional will *never* be true. But since the
                    # behavior of various functions is still unsettled, this check
                    # will stay here for the time being.
                    if after == remainder:
                        continue

                    if before in all_prefixes:
                        state = (cur_prefixes + (before,), after)
                        cost = len(after)

                        # Incentivize short vowels. This avoids errors with roots
                        # like "upodgrah" ("upa-ud-grah"). Without the incentive,
                        # we could have "upa-A-ud-grah" instead.
                        if before and before[-1] in 'aiufx':
                            cost -= 1
                        q.put_nowait((cost,) + state)

            # Convert 'alternate' prefixes back to their original forms.
            prefixes = [prefix_alternates.get(x, x) for x in prefixes]
            if not prefixes:
                # Occurs if the root's prefix is unrecognized
                continue

            # We still don't know the prefix group. We can find it by splitting
            # off the root and keeping whatever matches `last_letter`.
            for group in sandhi.split_off(prefixed_root, unprefixed_root):
                if group[-1] == last_letter:
                    break
            prefix_string = '-'.join(prefixes)
            rows.append((group, prefix_string))

    labels = ['group', 'prefixes']
    with util.write_csv(out_path, labels) as write_row:
        for row in util.unique(rows):
            datum = dict(zip(labels, row))
            write_row(datum)
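
The nibbling loop above is a best-first search: each candidate split goes into a priority queue keyed by how much of the word is still unexplained, with a small bonus for prefixes ending in a short vowel. Stripped of the sandhi machinery (plain string slicing instead of sandhi.splits), the same search pattern looks roughly like this illustrative sketch:

import heapq

def split_prefixes(word, prefixes):
    # Best-first search: prefer states that leave the least unexplained text.
    q = [(0, (), word)]
    while q:
        _, found, remainder = heapq.heappop(q)
        if remainder in prefixes:
            return list(found) + [remainder]
        for i in range(1, len(remainder)):
            before, after = remainder[:i], remainder[i:]
            if before in prefixes:
                cost = len(after)
                if before[-1] in 'aiufx':  # same short-vowel incentive as above
                    cost -= 1
                heapq.heappush(q, (cost, found + (before,), after))
    return []

# e.g. split_prefixes('samupa', {'sa', 'sam', 'upa'}) -> ['sam', 'upa']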
Ejemplo n.º 46
0
def make_sandhi_object(sandhi_rules_file):
    """Makes a Sandhi object for splitting and joining verb prefixes."""
    with util.read_csv(sandhi_rules_file) as reader:
        rules = [(x['first'], x['second'], x['result']) for x in reader]
        return S.Sandhi(rules + S.PREFIX_SANDHI_RULES)
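
The rules file is therefore expected to have first, second, and result columns. As a rough illustration of how the object would be constructed (the rows below are invented SLP1-style examples, not the project's actual rules):

import csv

with open('sandhi-rules.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['first', 'second', 'result'])
    writer.writerow(['a', 'i', 'e'])    # a + i -> e
    writer.writerow(['a', 'u', 'o'])    # a + u -> o
    writer.writerow(['a', 'a', 'A'])    # a + a -> A (long a)

sandhi = make_sandhi_object('sandhi-rules.csv')
# The returned object is used elsewhere in this module via sandhi.join(...)
# to attach prefixes and sandhi.split_off(...) to peel them apart.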