def write_shs_verbal_indeclinables(adverbs_path, final_path, root_converter, out_path):
    """Write SHS verbal indeclinables."""
    labels = None
    clean_rows = []

    with util.read_csv(adverbs_path) as reader:
        for row in reader:
            root_pair = root_converter.get(row['root'])
            if root_pair is None:
                continue
            row['root'] = root_pair[0]
            row['hom'] = root_pair[1]
            clean_rows.append(row)

    with util.read_csv(final_path) as reader:
        for row in reader:
            root_pair = root_converter.get(row['root'])
            if root_pair is None:
                continue
            row['root'] = root_pair[0]
            row['hom'] = root_pair[1]

            # TODO: handle 'ya' gerunds
            if not row['form'].endswith('um'):
                continue

            clean_rows.append(row)
        labels = reader.fieldnames
        labels.insert(labels.index('root') + 1, 'hom')

    with util.write_csv(out_path, labels) as write_row:
        for row in clean_rows:
            write_row(row)
def ncbi_idmapping_data(self):
    if not os.path.exists(path.join(SyncDB.DOWNLOAD_DIR(), "gene2refseq.gz")):
        urllib.urlretrieve("ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz",
                           os.path.join(SyncDB.DOWNLOAD_DIR(), "gene2refseq.gz"))
    if not os.path.exists(path.join(SyncDB.DOWNLOAD_DIR(), "gene2ensembl.gz")):
        urllib.urlretrieve("ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2ensembl.gz",
                           os.path.join(SyncDB.DOWNLOAD_DIR(), "gene2ensembl.gz"))
    if not os.path.exists(path.join(SyncDB.DOWNLOAD_DIR(), "gene_refseq_uniprotkb_collab.gz")):
        urllib.urlretrieve("ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_refseq_uniprotkb_collab.gz",
                           os.path.join(SyncDB.DOWNLOAD_DIR(), "gene_refseq_uniprotkb_collab.gz"))

    gene2ref = (util.read_csv(SyncDB.DOWNLOAD_DIR() + '/gene2refseq.gz', sep='\t', header=False, skiprows=1,
                              names=['Tax_ID', 'GeneID', 'status', 'Refseq_RNA', 'RNA_nucleotide_gi', 'Refseq_Prot',
                                     'protein_gi', 'genomic_nucleotide_accession.version', 'genomic_nucleotide_gi',
                                     'start_position_on_the_genomic_accession',
                                     'end_position_on_the_genomic_accession', 'orientation', 'assembly',
                                     'mature_peptide_accession.version', 'mature_peptide_gi', 'Symbol'])
                [['Tax_ID', 'GeneID', 'Refseq_RNA', 'Refseq_Prot', 'Symbol', 'status']]
                .query('status != "SUPPRESSED" and Refseq_RNA != "-" and Tax_ID == [9606,10090,10116]')
                .drop_duplicates())
    gene2ref = gene2ref.iloc[np.where(gene2ref["GeneID"].notnull())[0]]
    gene2ref = gene2ref.iloc[np.where(gene2ref["Refseq_RNA"].notnull())[0]]
    gene2ref = gene2ref[gene2ref['Refseq_RNA'].str.contains("X") == False]  # Remove computed Refseq

    ref2uniprot = util.read_csv(SyncDB.DOWNLOAD_DIR() + '/gene_refseq_uniprotkb_collab.gz', sep='\t', header=False,
                                skiprows=1, names=['Refseq_Prot', 'Uniprot_ID'])
    ref2uniprot = (ref2uniprot.iloc[np.where(ref2uniprot["Refseq_Prot"].notnull())[0]]
                   .drop_duplicates(subset='Refseq_Prot', take_last=True).fillna('-'))

    gene2ens = (util.read_csv(SyncDB.DOWNLOAD_DIR() + '/gene2ensembl.gz', sep='\t', header=False, skiprows=1,
                              names=['Tax_ID', 'GeneID', 'Ensembl_Gene', 'Refseq_RNA', 'Ensembl_Trans',
                                     'Refseq_Prot', 'Ensembl_Prot'])
                .query('Tax_ID == [9606,10090,10116]')[['Ensembl_Gene', 'Refseq_RNA', 'Ensembl_Trans', 'Ensembl_Prot']])
    gene2ens = (gene2ens.iloc[np.where(gene2ens["Refseq_RNA"].notnull())[0]]
                .drop_duplicates(subset='Refseq_RNA', take_last=True).fillna('-'))

    f = lambda x: x.split('.')[0]
    gene2ref['Refseq_Prot'] = gene2ref['Refseq_Prot'].map(f)
    gene2ref['Refseq_RNA'] = gene2ref['Refseq_RNA'].map(f)
    gene2ens['Refseq_RNA'] = gene2ens['Refseq_RNA'].map(f)

    df = pd.merge(gene2ref, ref2uniprot, on='Refseq_Prot', how='left')
    df = pd.merge(df, gene2ens, on='Refseq_RNA', how='left')
    df["Source"] = "NCBI"
    df = df.fillna('-')
    print "NCBI ID mapping data processed..."
    return df
def write_mw_prefixed_roots(prefixed_roots, unprefixed_roots, prefix_groups, sandhi_rules, out_path):
    """Parse the prefixes in a prefix root and write the parsed roots."""
    with util.read_csv(prefix_groups) as reader:
        prefix_groups = {x['group']: x['prefixes'] for x in reader}
    with util.read_csv(unprefixed_roots) as reader:
        root_set = {(x['root'], x['hom']) for x in reader}

    candidate_homs = [None] + [str(i) for i in range(1, 10)]
    sandhi = make_sandhi_object(sandhi_rules)

    rows = []
    for row in util.read_csv_rows(prefixed_roots):
        for group in sandhi.split_off(row['prefixed_root'], row['unprefixed_root']):
            if group in prefix_groups:
                basis, hom = row['unprefixed_root'], row['hom']
                if (basis, hom) not in root_set:
                    for x in candidate_homs:
                        if (basis, x) in root_set:
                            hom = x
                            break
                    if (basis, hom) not in root_set:
                        continue
                rows.append((row['prefixed_root'], prefix_groups[group], row['unprefixed_root'], hom))
                break

    labels = ['prefixed_root', 'prefixes', 'unprefixed_root', 'hom']
    with util.write_csv(out_path, labels) as write_row:
        for row in rows:
            write_row(dict(zip(labels, row)))
def get_jax_annotations(self):
    #urllib.urlretrieve("ftp://ftp.informatics.jax.org/pub/reports/HMD_HumanPhenotype.rpt", self.fn_dest_gene2phenotype)
    #urllib.urlretrieve("ftp://ftp.informatics.jax.org/pub/reports/VOC_MammalianPhenotype.rpt", self.fn_dest_reference)
    urllib.urlretrieve('http://www.informatics.jax.org/downloads/reports/HMD_HumanPhenotype.rpt',
                       self.fn_dest_gene2phenotype)
    urllib.urlretrieve('http://www.informatics.jax.org/downloads/reports/VOC_MammalianPhenotype.rpt',
                       self.fn_dest_reference)

    df_gene2phenotype = util.read_csv(self.fn_dest_gene2phenotype,
                                      names=['human_symbol', 'gid', 'homolo_gid', 'yes_no', 'mouse_marker',
                                             'mgi_marker', 'phenotype_ids'],
                                      sep=r'\t', index_col=False)
    df_mgi_reference = util.read_csv(self.fn_dest_reference,
                                     names=['phenotype_id', 'name', 'description'],
                                     sep=r'\t', index_col=False)
    df_gene2phenotype['mgi_marker'] = df_gene2phenotype['mgi_marker'].map(str.strip)
    df_mgi_reference['phenotype_id'] = df_mgi_reference['phenotype_id'].map(str.strip)

    data = []
    for index, r in df_gene2phenotype.iterrows():
        if r['phenotype_ids']:
            for pid in r['phenotype_ids'].split(' '):
                data.append({'gid': r['gid'], 'phenotype_id': pid})

    df_gene2phenotype = pd.DataFrame(data)
    df_join = pd.merge(df_gene2phenotype, df_mgi_reference,
                       left_on='phenotype_id', right_on='phenotype_id', how='inner')

    data = []
    for k, g in df_join.groupby('gid', as_index=False):
        data.append({'gid': k,
                     'content': '; '.join(g['name']),
                     'annotation_field1': '; '.join([x for x in g['description'] if x is not None]),
                     'tax_id': '9606'})

    pd.DataFrame(data).to_csv(self.fn_dest_jax_annotations, index=False)
def get_ensembl2gid_df_web(self, tax_id, type):
    #mmusculus_gene_ensembl 10090
    #rnorvegicus_gene_ensembl 10116
    print "Get %s to gene id for %s" % (type, tax_id)
    attr = '<Attribute name = "' + type + '" />'
    db_name = self.get_dbname_by_taxid(tax_id)
    if db_name is None:
        return None

    fname = 'ensembl2gid_%s_%s' % (type, tax_id)
    util.unix('mkdir -p ' + SyncDB.DOWNLOAD_DIR() + '/ensembl_files')

    valid_files = []
    for chr in self.chrList:
        ensembl_file = SyncDB.DOWNLOAD_DIR() + "/ensembl_files/%s_chr%s" % (fname, chr)
        print "downloading %s from %s for chr %s..." % (type, db_name, chr)
        if not os.path.exists(ensembl_file):
            cmd = ('wget -O ' + ensembl_file +
                   ' \'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?>'
                   '<!DOCTYPE Query><Query virtualSchemaName = "default" formatter = "TSV" header = "0" '
                   'uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "' + db_name +
                   '" interface = "default"><Filter name="chromosome_name" value="' + chr +
                   '" filter_list=""/>' + attr + '<Attribute name = "entrezgene" /></Dataset></Query>\'')
            util.unix(cmd)
        try:
            tdf = util.read_csv(ensembl_file, sep="\t", header=None, names=['source_id', 'gid'], nrows=1)
        except Exception as exp:
            tdf = pd.DataFrame()
        if len(tdf) != 0:
            valid_files.append(ensembl_file)

    if len(valid_files) == 0:
        return None

    cmd = 'cat %s >> %s' % (' '.join(valid_files), SyncDB.DOWNLOAD_DIR() + "/ensembl_files/" + fname)
    print cmd
    util.unix(cmd)
    print "downloading %s from %s done." % (type, db_name)

    ensembl_data = util.read_csv(SyncDB.DOWNLOAD_DIR() + "/ensembl_files/" + fname, sep="\t", header=None,
                                 names=['source_id', 'gid'])
    ensembl_data = ensembl_data[ensembl_data['gid'].notnull()]
    ensembl_data = ensembl_data[ensembl_data['source_id'].notnull()]
    ensembl_data[['gid']] = ensembl_data[['gid']].astype(int)
    ensembl_data['tax_id'] = tax_id
    ensembl_data['type_name'] = type
    return ensembl_data
def make_root_converter(shs_roots_path, shs_blacklist_path, shs_override_path, mw_unprefixed_roots_path):
    """Returns a dict that maps SHS roots to MW roots.

    Specifically, the dict maps strings to a list of (root, hom) tuples.
    """
    with util.read_csv(shs_blacklist_path) as reader:
        blacklist = {x['name'] for x in reader}
    with util.read_csv(shs_override_path) as reader:
        override = {x['shs']: x['mw'] for x in reader}

    # (root, class) -> [shs_root]
    class_pair_to_shs_roots = {}
    with util.read_csv(shs_roots_path) as reader:
        for row in reader:
            shs_root = row['root']
            vclass = row['class']

            clean_root = shs_root
            if shs_root in blacklist:
                clean_root = None
            elif shs_root in override:
                clean_root = override[shs_root]
            if clean_root is None:
                continue
            clean_root = shs_root.partition('#')[0]

            class_pair_to_shs_roots.setdefault((clean_root, vclass), set()).add(shs_root)
    assert len(class_pair_to_shs_roots.keys()) > 0

    # (root, class) -> [(mw_root, hom)]
    class_pair_to_mw_roots = {}
    with util.read_csv(mw_unprefixed_roots_path) as reader:
        for row in reader:
            root, hom, vclass = row['root'], row['hom'], row['class']
            class_pair_to_mw_roots.setdefault((root, vclass), []).append((root, hom))
    assert len(class_pair_to_mw_roots.keys()) > 0

    # shs_root -> (mw_root, hom)
    converter = {}
    for shs_pair in class_pair_to_shs_roots:
        if shs_pair not in class_pair_to_mw_roots:
            continue
        shs_roots = class_pair_to_shs_roots[shs_pair]
        for shs_root in shs_roots:
            for mw_root in class_pair_to_mw_roots[shs_pair]:
                converter[shs_root] = mw_root
    assert len(converter.keys()) > 0

    return converter
def main(unused_argv):
    training_data, training_target = util.read_csv(TRAINING)
    testing_data, testing_target = util.read_csv(TESTING)
    training = util.DataSet(training_data, training_target)
    test = util.DataSet(testing_data, testing_target)

    x = tf.placeholder(tf.float32, [None, 9], name="x")
    y_ = tf.placeholder(tf.float32, [None, 1], name="y_")
    y_conv, keep_prob = deepnn(x)

    with tf.name_scope('loss'):
        cross_entropy = tf.square(y_conv - y_)
        cross_entropy = tf.reduce_mean(cross_entropy)

    with tf.name_scope('adam_optimizer'):
        train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(tf.cast(y_conv, tf.int32, name="output"),
                                      tf.cast(y_, tf.int32, name="target"))
        correct_prediction = tf.cast(correct_prediction, tf.float32)
        accuracy = tf.reduce_mean(correct_prediction, name="predict_op")

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(50000):
            batch = training.next_batch(50)
            if i % 1000 == 0:
                training_accuracy = cross_entropy.eval(feed_dict={x: batch[0], y_: batch[1], keep_prob: 1.0})
                print('step %d, loss %g' % (i, training_accuracy))
            train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})

        print('test loss %g' % cross_entropy.eval(feed_dict={x: test._images, y_: test._labels, keep_prob: 1.0}))
        saver.save(sess, "model_1")
def main(unused_argv):
    training_data, training_target = util.read_csv(TRAINING)
    testing_data, testing_target = util.read_csv(TESTING)
    training = util.DataSet(training_data, training_target)
    test = util.DataSet(testing_data, testing_target)

    x = tf.placeholder(tf.float32, [None, 4], name="x")
    y_ = tf.placeholder(tf.float32, [None, 3], name="y_")
    y_conv, keep_prob = deepnn(x)

    with tf.name_scope('loss'):
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv)
        cross_entropy = tf.reduce_mean(cross_entropy)

    with tf.name_scope('adam_optimizer'):
        train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(tf.argmax(y_conv, 1, name="output"),
                                      tf.argmax(y_, 1, name="target"))
        correct_prediction = tf.cast(correct_prediction, tf.float32)
        accuracy = tf.reduce_mean(correct_prediction, name="predict_op")

    saver = tf.train.Saver()

    with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(20000):
            batch = training.next_batch(20)
            arr = convertLabels(batch[1])
            if i % 1000 == 0:
                training_accuracy = accuracy.eval(feed_dict={x: batch[0], y_: arr, keep_prob: 1.0})
                print('step %d, training accuracy %g' % (i, training_accuracy))
            train_step.run(feed_dict={x: batch[0], y_: arr, keep_prob: 0.5})

        arr = convertLabels(test._labels)
        print('test accuracy %g' % accuracy.eval(feed_dict={x: test._images, y_: arr, keep_prob: 1.0}))
        saver.save(sess, "model_1")
def get_ensembl2gid_map_old():
    if hasattr(GPUtils, 'ensembl2gid_map'):
        return GPUtils.ensembl2gid_map

    hgnc_file = path.join(SyncDB.DOWNLOAD_DIR(), "hgnc_complete_set.txt")
    ensembl_file = path.join(SyncDB.DOWNLOAD_DIR(), "hsapiens_gene_ensembl__gene__main.txt.gz")
    if not os.path.exists(hgnc_file):
        urllib.urlretrieve("ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt", hgnc_file)
    if not os.path.exists(ensembl_file):
        urllib.urlretrieve(GPUtils.get_ensembl_mart_url(), ensembl_file)

    hgnc_data = util.read_csv(hgnc_file, sep="\t")[['hgnc_id', 'symbol', 'ensembl_gene_id', 'entrez_id']]
    #Tracer()();
    ensembl_data = util.read_csv(ensembl_file, header=None, skiprows=1, sep="\t")[[4, 5, 6]]

    hgnc_lookup = {}
    for i in hgnc_data.index:
        hgnc_lookup[hgnc_data.at[i, 'hgnc_id']] = hgnc_data.at[i, 'entrez_id']

    out = []
    lookup = {}
    for i in ensembl_data.index:
        hgnc_id = ensembl_data.at[i, 4]
        l = hgnc_id.find("HGNC:")
        if l < 0:
            #print "HGNC: was not found in ", hgnc_id;
            continue
        hgnc_id = hgnc_id[l:-1]
        if hgnc_id in hgnc_lookup:
            lookup[ensembl_data.at[i, 6]] = hgnc_lookup[hgnc_id]
            out.append({'ensembl_gene_id': ensembl_data.at[i, 6], 'gene_id': hgnc_lookup[hgnc_id]})

    pd.DataFrame(out).sort(['ensembl_gene_id']).to_csv('ebi_ensembl_map.csv', index=False)
    #Tracer()()
    GPUtils.ensembl2gid_map = lookup
    return lookup
def get_variations(self):
    ensembl_file = SyncDB.DOWNLOAD_DIR() + "/ensembl_files/ensembl_variations.csv"
    print "Processing variations"
    if os.path.exists(ensembl_file):
        t = util.read_csv(ensembl_file)
    else:
        con = self.get_ensembl_connection(
            EnsemblDownload.get_ensembl_latest_version(
                'homo_sapiens_variation_{0}_'.format(EnsemblDownload.ENSEMBL_VERSION)))
        query = "select distinct pf.object_id as variation_name,p.description,v.clinical_significance,vg.gene_name, s.name as source_name from source s, phenotype_feature pf, phenotype p, variation v, variation_genename vg where pf.type ='Variation' and pf.phenotype_id = p.phenotype_id and v.name=pf.object_id and v.variation_id=vg.variation_id and v.source_id=s.source_id and v.clinical_significance in ('likely pathogenic','pathogenic','risk factor','association','drug response')"
        t = db.from_sql(con, query, params=[])
        t.to_csv(ensembl_file, index=False)

    map = GPUtils.get_sym2gid_map()["sym2gid"]
    data = []
    for gene, row in t.groupby(['gene_name']):
        if gene in map:
            #Tracer()()
            content = ['[' + r[1]['variation_name'] + '] ' + r[1]['description'] +
                       '{' + r[1]['clinical_significance'] + '}(' + r[1]['source_name'] + ')'
                       for r in row.iterrows()]
            data.append({'gid': map[gene],
                         'content': ';'.join(content),
                         'annotation_field1': gene,
                         'type_name': 'VARIATIONS_ENSEMBL',
                         'tax_id': '9606'})
    return data
def get_annotation_martdb(self, tax_id, a_type, is_boolean=False):
    import math
    db_name = self.get_dbname_by_taxid(tax_id)
    if db_name is None:
        return None

    file = SyncDB.DOWNLOAD_DIR() + "/ensembl_files/biomart_%s_%s.csv" % (a_type, tax_id)
    print "Running query to get %s for %s from martdb" % (a_type, tax_id)
    query = self.get_annotation_mart_query(a_type, db_name)
    con = self.get_biomart_connection()
    try:
        if os.path.exists(file):
            df = util.read_csv(file)
        else:
            df = db.from_sql(con, query).drop_duplicates()
            df.to_csv(file, index=False)
    except:
        print "error in getting %s data for %s" % (a_type, tax_id)
        return None

    data = []
    #Tracer()()
    for k, grow in df.groupby(['gid']):
        # Tracer()()
        cnt = []
        for i in grow.index:
            v1 = grow.at[i, "term"]
            v2 = grow.at[i, "description"]
            try:
                if type(v1) is str or not math.isnan(v1):
                    try:
                        if type(v2) is str or not math.isnan(v2):
                            cnt.append('[%s] %s' % (str(v1), str(v2)))
                        else:
                            cnt.append(str(v1))
                    except:
                        cnt.append(str(v1))
            except:
                pass

        cnt = pd.unique(cnt)
        content = ''
        if is_boolean:
            if len(cnt) > 0:
                content = "Yes"
        else:
            content = ';'.join(cnt)

        if content != '':
            data.append({'gid': k,
                         'content': content,
                         'annotation_field1': grow.at[i, 'gene'],
                         'type_name': a_type,
                         'tax_id': tax_id})
    return data
def get_ensembl2gid_df_not_used(self, tax_id, type):
    print "Get %s to gene id for %s" % (type, tax_id)
    source_file_name = self.get_idmap_source_file(type, tax_id)
    if source_file_name is None:
        return None

    source_file_name = SyncDB.DOWNLOAD_DIR() + "/ensembl_files/" + source_file_name
    out_file = SyncDB.DOWNLOAD_DIR() + "/ensembl_files/" + 'ensembl2gid_%s_%s' % (type, tax_id)
    if not os.path.exists(source_file_name):
        urllib.urlretrieve(self.mart_ftp + "/" + self.get_idmap_source_file(type, tax_id), source_file_name)

    ensembl_data = util.read_csv(source_file_name, sep="\t", header=None, names=['source_id', 'gid'])
    ensembl_data = ensembl_data[ensembl_data['gid'].notnull()]
    ensembl_data = ensembl_data[ensembl_data['source_id'].notnull()]
    ensembl_data[['gid']] = ensembl_data[['gid']].astype(int)
    ensembl_data['tax_id'] = tax_id
    ensembl_data['type_name'] = type
    return ensembl_data
def ensembl_trans2gene_map_by_taxid(self, tax_id):
    if tax_id == 9606:
        db_name = "hsapiens_gene_ensembl"
        fname = "ensembl_genes_human_trans2gid_map.csv"
    elif tax_id == 10090:
        db_name = "mmusculus_gene_ensembl"
        fname = "ensembl_genes_mouse_trans2gid_map.csv"
    elif tax_id == 10116:
        db_name = "rnorvegicus_gene_ensembl"
        fname = "ensembl_genes_rat_trans2gid_map.csv"

    util.unix('mkdir -p ' + SyncDB.DOWNLOAD_DIR() + '/ensembl_files')
    ensembl_file = SyncDB.DOWNLOAD_DIR() + "/ensembl_files/%s" % fname
    if not os.path.exists(ensembl_file):
        cmd = ('wget -O ' + ensembl_file +
               ' \'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?>'
               '<!DOCTYPE Query><Query virtualSchemaName = "default" formatter = "TSV" header = "0" '
               'uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "' + db_name +
               '" interface = "default"><Attribute name = "ensembl_gene_id"/>'
               '<Attribute name = "ensembl_transcript_id"/><Attribute name = "ensembl_peptide_id"/>'
               '</Dataset></Query>\'')
        util.unix(cmd)

    df = util.read_csv(ensembl_file, sep='\t', names=['Ensembl_Gene', 'Ensembl_Trans', 'Ensembl_Prot'])
    df = df.iloc[np.where(df["Ensembl_Trans"].notnull())[0]]
    df = df.drop_duplicates()
    df["Tax_ID"] = tax_id
    return df
def create_term_file(self):
    kwargs = {}
    if 'oldCols' in self.options:
        kwargs['names'] = self.options['oldCols'].split(',')

    if 'read_csv' in self.options:
        for kv_str in self.options['read_csv'].split(','):
            kv = kv_str.split('=')
            kwargs[kv[0]] = kv[1]
            if kv[1] == 'None':
                kwargs[kv[0]] = None
            if kv[0].lower() == 'skiprows':
                kwargs[kv[0]] = int(kv[1])

    iter_csv = util.read_csv(self.fn_source, iterator=True, chunksize=self.get_chunksize(), dtype=str, **kwargs)
    term_id_col = 'term_id' if 'term_id' not in self.column_map else self.column_map['term_id']

    term_ids = []
    for chunk in iter_csv:
        term_ids += util.unique(chunk[term_id_col])
    term_ids = util.unique(term_ids)

    with open(self.fn_dest, "w") as myfile:
        wr = csv.writer(myfile)
        wr.writerow(['term_id', 'term_name', 'term_type'])
        wr.writerows([[term_id, self.term_name if self.term_name else term_id, self.options['typeName']]
                      for term_id in term_ids])
def instantiate_table_as_pandas_dataframe(self, table_name, **kwargs):
    #!!! Prolly should check to make sure cache exists.

    # commenting out until encoding issue fixed
    # if self.config['tables'][table_name]['cache']['type'] != 'flat_file':
    #     raise ValueError("Table '+table_name+' cannot be instantiated, because it's not cached as a flat_file")

    # Replace from here...
    # filename = self.config['provenance']['data_filepath'] + \
    #     self.config['tables'][table_name]['cache']['filename']
    # df = pd.read_csv(filename, **kwargs)
    # lnc_df = util.convert_pandas_dataframe_to_lncPandasDataFrame(
    #     df,
    #     self.config['tables'][table_name],
    #     parent_pipeline=self
    # )
    # ...to here.

    # Replacement code. Try this soon...
    table_config = self.config["tables"][table_name]
    filename = self.config["provenance"]["data_filepath"] + table_config["cache"]["filename"]
    lnc_df = util.read_csv(filename, table_config, **kwargs)

    self.instantiated_tables[table_name] = lnc_df
    self.I = self.instantiated_tables

    # return df
    return lnc_df
def get_ensembl2gid_map_ensembl():
    if hasattr(GPUtils, 'ensembl2gid_map_ensembl'):
        return GPUtils.ensembl2gid_map_ensembl

    ensembl_file = path.join(SyncDB.DOWNLOAD_DIR(), "ensembl_genes_info.csv")
    if not os.path.exists(ensembl_file):
        cmd = ('wget -O ' + ensembl_file +
               ' - \'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?>'
               '<!DOCTYPE Query><Query virtualSchemaName = "default" formatter = "TSV" header = "0" '
               'uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "hsapiens_gene_ensembl" '
               'interface = "default" ><Attribute name = "ensembl_gene_id"/><Attribute name = "entrezgene"/>'
               '</Dataset></Query>\'')
        util.unix(cmd)

    ensembl_data = util.read_csv(ensembl_file, sep="\t", header=None, names=['ensembl_gene_id', 'gene_id'])
    ensembl_data = ensembl_data[ensembl_data['gene_id'].notnull()]
    ensembl_data[['gene_id']] = ensembl_data[['gene_id']].astype(int)

    GPUtils.ensembl2gid_map_ensembl = {}
    for i in ensembl_data.index:
        GPUtils.ensembl2gid_map_ensembl[ensembl_data.at[i, 'ensembl_gene_id']] = \
            GPUtils.ensembl2gid_map_ensembl.get(ensembl_data.at[i, 'ensembl_gene_id']) or []
        GPUtils.ensembl2gid_map_ensembl[ensembl_data.at[i, 'ensembl_gene_id']].append(ensembl_data.at[i, 'gene_id'])
    #Tracer()()
    return GPUtils.ensembl2gid_map_ensembl
def populate_product(self):
    """ Insert data into product table in the database """
    df = read_csv(self.file_name)
    for idx, row in df.iterrows():
        product_name = row['product_name']
        bar_code = row['code']
        url = row['url']
        nutrition_grade = row['nutrition_grade_fr']
        energy = row['energy']
        proteins = row['proteins']
        category_names = row['main_category']
        store_names = row['stores_y']

        new_product = Product(product_name=product_name, bar_code=bar_code, url=url,
                              nutrition_grade=nutrition_grade, energy=energy, proteins=proteins)
        new_category = sess.query(Category).filter(Category.category_name == category_names).first()
        new_store = sess.query(Store).filter(Store.store_name == store_names).first()
        if new_store is None:
            new_store = Store(store_name=store_names)
            sess.add(new_store)

        new_product.categorys.append(new_category)
        new_product.stores.append(new_store)
        sess.add(new_product)

    sess.commit()
    sess.close()
def get_ensembl2gid_map_ncbi():
    if hasattr(GPUtils, 'ensembl2gid_map_ncbi'):
        return GPUtils.ensembl2gid_map_ncbi

    if not os.path.exists(path.join(SyncDB.DOWNLOAD_DIR(), "gene2ensembl.gz")):
        urllib.urlretrieve("ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2ensembl.gz",
                           os.path.join(SyncDB.DOWNLOAD_DIR(), "gene2ensembl.gz"))

    gene2ens = (util.read_csv(os.path.join(SyncDB.DOWNLOAD_DIR(), 'gene2ensembl.gz'),
                              header=None, skiprows=1, sep="\t",
                              names=["tax_id", "GeneID", "Ensembl_gene_identifier",
                                     "RNA_nucleotide_accession.version", "Ensembl_rna_identifier",
                                     "protein_accession.version", "Ensembl_protein_identifier"])
                .query('tax_id in [9606]')[['GeneID', 'Ensembl_gene_identifier']])
    gene2ens = gene2ens.drop_duplicates()

    GPUtils.ensembl2gid_map_ncbi = {}
    for i in gene2ens.index:
        GPUtils.ensembl2gid_map_ncbi[gene2ens.at[i, 'Ensembl_gene_identifier']] = \
            GPUtils.ensembl2gid_map_ncbi.get(gene2ens.at[i, 'Ensembl_gene_identifier']) or []
        GPUtils.ensembl2gid_map_ncbi[gene2ens.at[i, 'Ensembl_gene_identifier']].append(gene2ens.at[i, 'GeneID'])
    #Tracer()()
    return GPUtils.ensembl2gid_map_ncbi
def get_gene_pathway_map(self):
    print 'Getting GeneGo pathway data'
    #df = self.fetch("select distinct g17.ref as gid, m.MAPNAME as term_name, m.imid as term_id from genemaps gm, imagemap_table m, genedbs_17 g17, geneorgs go, genes g where gm.im=m.imid and gm.gene=g17.gene and go.gene = gm.gene and go.org=1 and g.geneid=go.gene");
    fn = self.dir + "/pathway.csv"
    if not os.path.exists(fn):
        df = self.fetch(
            "select distinct i.imid as term_id, i.imagename url, i.mapname term_name, d.ref as gid, orgs.taxonomyid as tax_id from pw_imagemap_shapes s, pw_imagemap_class c, imagemap_table i, gene_netw n, genedbs d, geneorgs o, orgs where s.id=c.shape_id and i.imid=s.im and n.id=c.object_id and d.gene=n.gene and o.gene=n.gene and i.publish=1 and o.org=orgs.orgid and orgs.taxonomyid in (%s) and d.db=17"
            % (','.join(self.taxidList)))
        #Tracer()()
        df['TERM_ID'] = 'gMAP' + df['TERM_ID'].map(str)
        df.rename2({"TERM_ID": "term_id",
                    "TERM_NAME": "term_name",
                    "GID": "gid",
                    "URL": "term_field1",
                    "TAX_ID": "tax_id"})
        df['type_name'] = 'GeneGo Pathway'
        df.to_csv(fn, index=False)
    else:
        df = util.read_csv(fn)

    self.pathway_gid2term = df
    df2 = pd.DataFrame(df.copy())[['term_id', 'term_name', 'type_name']]
    df2['description'] = df2['term_name']
    df2 = df2.drop_duplicates()
    self.pathway_terms = df2
    self.pathway_done = True
    print 'GeneGo pathway data captured'
def get_ucsc2gid_df(self, tax_id):
    #mmusculus_gene_ensembl 10090
    #rnorvegicus_gene_ensembl 10116
    db_name = self.get_dsname_by_taxid(tax_id)
    if db_name is None:
        return pd.DataFrame()

    util.unix('mkdir -p ' + SyncDB.DOWNLOAD_DIR() + '/ucsc_files')
    file = SyncDB.DOWNLOAD_DIR() + "/ucsc_files/ucscid2gid_%s" % tax_id
    if not os.path.exists(file):
        con = db.get_con('UCSC')
        try:
            df = db.from_sql(con, "select name as source_id, value as gid from %s.knownToLocusLink" % db_name)
        except Exception as exp:
            return pd.DataFrame()
        df.to_csv(file, index=False)

    data = util.read_csv(file)
    data = data[data['gid'].notnull()]
    data = data[data['source_id'].notnull()]
    data[['gid']] = data[['gid']].astype(int)
    data['tax_id'] = tax_id
    return data
def get_co_authors(paper_csv):
    papers = read_csv(paper_csv, ['first_name', 'last_name', 'keys', 'valid',
                                  'pub_key', 'pub_title', 'put_year', 'pub_authors'])
    papers_dic = {}
    for p in papers:
        a_id = int(p['id'])
        if a_id not in papers_dic:
            papers_dic[a_id] = {}
            papers_dic[a_id]['first_name'] = p['first_name']
            papers_dic[a_id]['last_name'] = p['last_name']
            papers_dic[a_id]['keys'] = set([p['keys']])
            papers_dic[a_id]['pubs'] = []
            papers_dic[a_id]['co-authors'] = {}

        a_dic = papers_dic[a_id]
        if p['valid']:
            pub = (p['pub_key'], p['pub_title'])
            a_dic['pubs'].append(pub)
            for co_a in p['pub_authors'].split(";"):
                if co_a not in a_dic['co-authors']:
                    a_dic['co-authors'][co_a] = [pub]
                else:
                    a_dic['co-authors'][co_a].append(pub)
    return papers_dic
def get_ensembl2gid_map_ebi():
    if hasattr(GPUtils, 'ensembl2gid_map_ebi'):
        return GPUtils.ensembl2gid_map_ebi

    hgnc_file = path.join(SyncDB.DOWNLOAD_DIR(), "hgnc_complete_set.txt")
    if not os.path.exists(hgnc_file):
        urllib.urlretrieve("ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt", hgnc_file)

    hgnc_data = util.read_csv(hgnc_file, sep="\t")[['hgnc_id', 'symbol', 'ensembl_gene_id', 'entrez_id']]
    #Tracer()();
    GPUtils.ensembl2gid_map_ebi = {}
    for i in hgnc_data.index:
        GPUtils.ensembl2gid_map_ebi[hgnc_data.at[i, 'ensembl_gene_id']] = \
            GPUtils.ensembl2gid_map_ebi.get(hgnc_data.at[i, 'ensembl_gene_id']) or []
        GPUtils.ensembl2gid_map_ebi[hgnc_data.at[i, 'ensembl_gene_id']].append(hgnc_data.at[i, 'entrez_id'])
    #Tracer()()
    return GPUtils.ensembl2gid_map_ebi
def preprocess(self):
    # Load the page_code table and the labels.
    self.pcd, self.code2name = prep_pagecd(read_csv(cfg.pgcd))
    self.lbl = read_csv(cfg.label)

    for date in self.date_ranges:
        print(f"### {date} preprocessing started")
        merged_df = self._preprocess_per_mth(date)
        csvpath = os.path.join('s3://', cfg.data_dir, seqfname(date.year, date.month))
        merged_df.to_csv(csvpath)
        print(f'### {date} preprocessing finished')
        print()
    return merged_df
def main():
    # Reads from the data file and runs estimate for each row
    # Then plots the trajectory
    data_array = util.read_csv(config.DATASET_ABSOLUTE_PATH)

    row = data_array[0]
    time, encoder, angular_velocity, steering_angle = np.ravel(row)
    resulting_pos_heading = []
    pose_estimator = PoseEstimator((0, 0, 0), time, encoder, angular_velocity, steering_angle)

    i = 1
    while i < len(data_array):
        row = data_array[i]
        time, encoder, angular_velocity, steering_angle = np.ravel(row)
        x, y, heading = pose_estimator.estimate(time=time,
                                                steering_angle=steering_angle,
                                                encoder_ticks=encoder,
                                                angular_velocity=angular_velocity)
        resulting_pos_heading.append([x, y, heading])
        i = i + 1

    visualizer.plot_points(np.asarray(resulting_pos_heading)[:, 0],
                           np.asarray(resulting_pos_heading)[:, 1])
    visualizer.show()
def write_shs_verbal_data(data_path, root_converter, out_path):
    """Write Sanskrit Heritage Site data after converting its roots.

    :param data_path: path to the actual verb data
    :param root_converter: maps each SHS root to an (MW root, hom) pair.
                           Rows whose roots are not in this map are skipped.
    :param out_path: path to the output CSV
    """
    labels = None
    clean_rows = []

    with util.read_csv(data_path) as reader:
        for row in reader:
            root_pair = root_converter.get(row['root'])
            if root_pair is None:
                continue
            root, hom = root_pair
            row['root'] = root
            row['hom'] = hom
            clean_rows.append(row)
        labels = reader.fieldnames
        labels.insert(labels.index('root') + 1, 'hom')

    with util.write_csv(out_path, labels) as write_row:
        for row in clean_rows:
            write_row(row)
def main():
    # Runs pipeline
    # accuracy_vs_binning(False)

    # Read in data from csv
    X, y = util.read_csv(FILE, normalize=True, mean_center=True, do_bin=False, bin_step=25)

    # plot label frequency
    # plot_labels(y)

    # plot correlation
    # plot_correlation(X, y)

    # Partition data into train and test datasets
    X_train, y_train, X_test, y_test = partition(X, y)
    print(np.std(y_test))

    # Uncomment below to test Random Forest
    # run_pipeline_rf(X_train, y_train, X_test, y_test)

    # Uncomment below to test sklearn FC
    run_pipeline_mlp(X_train, y_train, X_test, y_test)
def do_post_update(self):
    # The synonyms are concatenated with '|' for each gene. This post_update
    # creates a separate record for each synonym.
    import re

    df = util.read_csv(self.fn_dest, dtype='S300')
    h = df.header()
    gid_col_name = h[0]
    synonyms_col_name = h[1]
    tax_id_col_name = h[2]

    rows = []
    for i, r in df.iterrows():
        for x in str(r[synonyms_col_name]).split('|'):
            m = re.search('([^:]*):(.*)', x)
            if m and len(m.groups()) == 2:
                db = m.group(1)
                xref = m.group(2)
                tax_id = r[tax_id_col_name]
                if db in ['MGI', 'WormBase', 'SGD', 'FLYBASE', 'ZFIN', 'RGD', 'Araport']:
                    rows.append({gid_col_name: r[gid_col_name],
                                 synonyms_col_name: xref,
                                 tax_id_col_name: tax_id,
                                 'id_status': db})

    df = pd.DataFrame(rows)
    df.to_csv(self.fn_dest, index=False)
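# A small standalone illustration of the split performed in do_post_update
# above. The synonym value here is made up for the example; 'MGI' is one of
# the database tags the loop actually keeps.
import re

m = re.search('([^:]*):(.*)', 'MGI:95723')   # hypothetical "db:xref" synonym
assert m.groups() == ('MGI', '95723')        # -> database tag, cross-reference id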
def find_by_symbols(self, path):
    """
    Helps finding subreddits for coins which cannot be found with the find_subreddits command.

    path: Path to file with a list of coin SYMBOLS (i.e. BTC, ETH, ...)
    """
    a = util.read_csv(path)
    symbols = [s[0] for s in a]
    known_coin_name_array = util.read_subs_from_file(settings.general["subreddit_file"])
    not_found, found = util.known_subs_for_symbols(known_coin_name_array, symbols)

    cap = CoinCap()
    coins = cap.get_coin_aliases(1000)
    coin_name_array = []
    # try finding remaining coins
    for coin in coins:
        if coin[-1] in not_found:
            coin_name_array.append(coin)

    if len(symbols) != len(coin_name_array):
        log.info("No coin data for {} coins.".format(len(symbols) - len(coin_name_array)))

    coin_name_array = sorted(coin_name_array, key=lambda c: c[-1])
    for coin_tuple in coin_name_array:
        coin_tuple.append("".join(x for x in coin_tuple[0] if x.isalnum()))

    subreddit_list = self.find_subreddits([name[-1] for name in coin_name_array])
    for i, coin in enumerate(coin_name_array):
        coin[-1] = subreddit_list[i]
    return coin_name_array, found
def get_rna_expression(self):
    self.get_ensembl2gid_map = GPUtils.get_ensembl2gid_map()
    # "Gene","Tissue","Cell type","Level","Expression type","Reliability"
    data_file = os.path.join(SyncDB.DOWNLOAD_DIR(), "protein_atlas/rna.csv.zip")
    urllib.urlretrieve("http://www.proteinatlas.org/download/rna.csv.zip", data_file)
    data = util.read_csv(data_file)

    allgenes = set()
    notfoundgenes = set()
    out = []
    for index, row in data.iterrows():
        if row["Abundance"] == "Not detected":
            continue
        allgenes.add(row['Gene'])
        if row['Gene'] in self.get_ensembl2gid_map:
            if self.is_cellline(row["Sample"]):
                continue
            content = row["Sample"] + "(" + row["Abundance"] + "|" + row["Unit"] + ":" + str(row["Value"]) + ")"
            for gid in self.get_ensembl2gid_map[row['Gene']]:
                out.append({'gid': gid, 'content': content, 'annotation_field1': row['Gene']})
        else:
            #print row['Gene']
            notfoundgenes.add(row['Gene'])

    print len(notfoundgenes), '/', len(allgenes), ' gene symbols (', float(len(notfoundgenes)) / len(allgenes) if len(allgenes) != 0 else 1, ') cannot be converted to gene ids in ' + data_file

    data = []
    for k, g in pd.DataFrame(out).groupby(['gid']):
        data.append({'gid': k,
                     'content': ";".join(g['content'].tolist()),
                     'type_name': 'Protein_Atlas_RNA',
                     'annotation_field1': g['annotation_field1'].tolist()[0]})
    return data
def get_omim_term(self):
    # This file requires an account, so it is copied manually for now.
    #urllib.urlretrieve("ftp://*****:*****@ftp.omim.org/OMIM/genemap2.txt", self.fn_dest_omim_term + '.tmp')
    df_omim = util.read_csv(os.path.join(SyncDB.DOWNLOAD_DIR(), "geneMap2.txt"), sep=r'\t', comment='#',
                            names=['Chromosome', 'Genomic Position Start', 'Genomic Position End',
                                   'Cyto Location', 'Computed Cyto Location', 'Mim Number', 'Gene Symbols',
                                   'Gene Name', 'Approved Symbol', 'Entrez Gene ID', 'Ensembl Gene ID',
                                   'Comments', 'Phenotypes', 'Mouse Gene Symbol/ID'])
    util.rename2(df_omim, {'Mim Number': 'mim_num',
                           'Gene Name': 'title',
                           'Phenotypes': 'disorders',
                           'Comments': 'comments'})
    # df_omim = df_omim[df_omim['Mim Number']==155600]
    # pprint(df_omim[:1].to_dict())
    df_omim[['mim_num', 'title', 'disorders', 'comments']].to_csv(self.fn_dest_omim_term, sep=',', index=False)
def make_gid2taxid(self):
    taxidList = SyncDB.SUPPORTED_SPECIES
    for child in self.children:
        if type(child).__name__ == "Species":
            taxidList = child.supported_species
            break

    if not os.path.exists(os.path.join(SyncDB.DOWNLOAD_DIR(), "geneid2taxid.csv")):
        taxid_filter = ""
        if len(taxidList) != 0:
            taxid_filter = ['$1==\"' + t + '\"' for t in taxidList]
            taxid_filter = "if (" + "||".join(taxid_filter) + ")"
        # gene_id,tax_id
        cmd = "time zcat " + SyncDB.DOWNLOAD_DIR() + "/gene_info.gz | cut -f1,2 | sed 1d | awk 'BEGIN{FS=\"\\t\"; OFS=\"\\t\"}{" + taxid_filter + " print $2,$1;}' | sort -k1,1 -t $'\\t' >" + SyncDB.DOWNLOAD_DIR() + "/geneid2taxid.csv"
        print cmd
        util.unix(cmd)

    df = util.read_csv(SyncDB.DOWNLOAD_DIR() + "/geneid2taxid.csv", names=["gid", "tax_id"], sep=r'\t')
    self.gid2taxid = {str(df.ix[i, 'gid']): str(df.ix[i, 'tax_id']) for i in df.index}
def write_verb_prefixes(upasargas, other, out_path):
    with util.read_csv(upasargas) as reader:
        upasargas = list(reader)
    with util.read_csv(other) as reader:
        other = list(reader)
        labels = reader.fieldnames

    assert 'prefix_type' in labels
    for x in upasargas:
        assert 'prefix_type' not in x
        x['prefix_type'] = 'upasarga'

    rows = sorted(upasargas + other, key=lambda x: util.key_fn(x['name']))
    with util.write_csv(out_path, labels) as write_row:
        for row in rows:
            write_row(row)
def write_prefixed_shs_verbal_data(data_path, prefixed_roots, root_converter, sandhi_rules, out_path):
    """Write Sanskrit Heritage Site data after converting its roots.

    :param data_path: path to the actual verb data
    :param out_path:
    """
    sandhi = make_sandhi_object(sandhi_rules)

    root_to_prefixed = {}
    with util.read_csv(prefixed_roots) as reader:
        for row in reader:
            root_to_prefixed.setdefault(row['unprefixed_root'], []).append(row)

    labels = None
    clean_rows = []
    with util.read_csv(data_path) as reader:
        for row in reader:
            root_pair = root_converter.get(row['root'])
            if root_pair is None:
                continue
            root, hom = root_pair

            for result in root_to_prefixed.get(root, []):
                new_row = row.copy()
                for field in ['form', 'stem']:
                    if field in row:
                        new_row[field] = sandhi.join(result['prefixes'].split('-') + [new_row[field]])
                new_row['root'] = result['prefixed_root']
                new_row['hom'] = hom
                clean_rows.append(new_row)
        labels = reader.fieldnames + ['hom']

    old_rows = list(util.read_csv_rows(out_path))
    clean_rows.sort(key=lambda x: util.key_fn(x['root']))

    with util.write_csv(out_path, labels) as write_row:
        for row in old_rows:
            write_row(row)
        for row in clean_rows:
            write_row(row)
def main():
    with open("data/owner.csv", 'rb') as o:
        owner_table = list(unicodecsv.reader(o))
    with open("data/disputed.csv", 'rb') as d:
        disputed_table = list(unicodecsv.reader(d))

    dates = [row[0] for row in owner_table[1:]]
    codes = sorted(list(set.union(set(owner_table[0][1:]), set(disputed_table[0][1:]))))
    table = [[""] + codes] + [[date] + ["" for code in codes] for date in dates]

    owner = util.read_csv("data/owner.csv")
    disputed = util.read_csv("data/disputed.csv")
    #Tracer()()

    for di, date in enumerate(dates):
        for ci, code in enumerate(codes):
            o = find(owner, date, code)
            d = find(disputed, date, code)
            if o and d:
                if o == "-":
                    table[di+1][ci+1] = d
                    print d
                elif d == "-":
                    table[di+1][ci+1] = o
                    print o
                else:
                    raise Exception("{} - {}: {} vs. {}".format(date, code, o, d))
            elif o:
                table[di+1][ci+1] = o
                print o
            elif d:
                table[di+1][ci+1] = d
                print d

    with open("data/description.csv", 'wb') as csvfile:
        writer = unicodecsv.writer(csvfile, delimiter=",")
        for row in table:
            writer.writerow(row)
def write_prefixed_shs_verbal_indeclinables(final_path, sandhi_rules, prefixed_roots, root_converter, out_path):
    """Write prefixed SHS verbal indeclinables."""
    sandhi = make_sandhi_object(sandhi_rules)

    root_to_prefixed = {}
    with util.read_csv(prefixed_roots) as reader:
        for row in reader:
            root_to_prefixed.setdefault(row['unprefixed_root'], []).append(row)

    labels = None
    clean_rows = []
    with util.read_csv(final_path) as reader:
        for row in reader:
            root_pair = root_converter.get(row['root'])
            if root_pair is None:
                continue
            root, hom = root_pair
            row['root'] = root

            for result in root_to_prefixed.get(root, []):
                new_row = row.copy()
                for field in ['form', 'stem']:
                    if field in row:
                        new_row[field] = sandhi.join(result['prefixes'].split('-') + [new_row[field]])
                new_row['root'] = result['prefixed_root']
                new_row['hom'] = result['hom']
                clean_rows.append(new_row)
        labels = reader.fieldnames
        labels += ['hom']

    old_rows = list(util.read_csv_rows(out_path))
    clean_rows.sort(key=lambda x: util.key_fn(x['root']))

    with util.write_csv(out_path, labels) as write_row:
        for row in old_rows:
            write_row(row)
        for row in clean_rows:
            write_row(row)
def awssqs():
    title = u'AWS SQS'
    if request.method == "POST":
        f = request.files.get('file')
        if f and allowed_file(f.filename):
            sendby = request.form.get('sendby')
            if sendby:
                sqs.add(sendby, read_csv(f))
                flash(u'Pushed to AWS SQS {0}'.format(sendby))
            else:
                flash(u'Invalid choice!')
        return redirect(url_for('awssqs'))
    else:
        return make_response(render_template('t_awssqs.htm', title=title, qlist=QUEUE_NAME_LIST, awssqs=1))
def send_all_first():
    title = u'Send All People First'
    if request.method == "POST":
        f = request.files.get('file')
        if f and allowed_file(f.filename):
            if request.form.get('sendby') == 'sqs':
                sqs.add(QUEUE_NAME_SENDFIRST, read_csv(f))
                flash(u'Pushed to AWS SQS')
            elif request.form.get('sendby') == 'mail':
                #t.template = t.env.get_template('./coscup_first.htm')
                #t.sendall(read_csv(f), t.send_first)
                flash(u'Sent the bulk registration mail')
            else:
                flash(u'Invalid choice!')
        return redirect(url_for('send_all_first'))
    else:
        return make_response(render_template('t_sendallfirst.htm', title=title, send_all_first=1))
def get_changes(tag):
    source = util.read_csv('data/' + tag + '.csv')
    original_sources[tag] = source[NOW]
    del source[NOW]
    changes_map[tag] = source
import sklearn.cross_validation
import sklearn.linear_model
import sklearn.ensemble
import numpy as np

import util

headers, data = util.read_csv("mpg_data.csv")

# the first column of the data is the MPG,
# which we want to use as our label
data = np.asarray(data).astype(float)
labels = data[:, 0]
features = data[:, 1:]

linear_reg = sklearn.linear_model.LinearRegression()

# cv=3 for three fold cross validation
scores = sklearn.cross_validation.cross_val_score(
    linear_reg, features, labels, cv=3, scoring="mean_absolute_error")

# sklearn switches the sign on ASE so that
# larger numbers are better
print(scores.mean())
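# A small follow-up sketch (not part of the original script): since this older
# scorer returns negated absolute errors, as the comment above notes, flipping
# the sign recovers the usual positive mean absolute error for reporting.
print(-scores.mean())  # mean absolute error in the label's units (MPG)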
from sklearn.cross_validation import cross_val_score
import sklearn.linear_model
import sklearn.tree
import sklearn.ensemble
import numpy as np
from timeit import default_timer as timer

import util

headers, data = util.read_csv("hw_data.csv")

# the last column of the data is the
# failure status, which we want to use as our label
data = np.asarray(data)
labels = data[:, -1]
features = data[:, :-1].astype(float)

# basic learners
clf1 = sklearn.linear_model.LogisticRegression()  # even though it says regression,
                                                  # this is for classification!
clf2 = sklearn.tree.DecisionTreeClassifier()

# bagging ensemble methods
clf3 = sklearn.ensemble.BaggingClassifier(
    sklearn.linear_model.LogisticRegression())
clf4 = sklearn.ensemble.BaggingClassifier(
    sklearn.tree.DecisionTreeClassifier())  # assumed base learner; the snippet is truncated here
def write_prefix_groups(prefixed_roots, unprefixed_roots, upasargas, other, sandhi_rules, out_path):
    """Parse the prefixes in a prefix root and write out the prefix groups.

    The procedure is roughly as follows:

        for each prefixed root in `prefixed_roots`:
            find (p_1, ..., p_n, r), where p_x is a prefix and r is a root
            write the prefix group (p_1, ..., p_n) to file.

    We find (p_1, .., p_n) by using the rules in `sandhi_rules` and verify
    that `p_x` is a prefix by checking for membership in `upasargas` and
    `other`.
    """

    # Loading prefixes
    all_prefixes = set()
    with util.read_csv(upasargas) as reader:
        all_prefixes.update([x['name'] for x in reader])
    with util.read_csv(other) as reader:
        all_prefixes.update([x['name'] for x in reader])

    # The 's' prefix is used in roots like 'saMskf' and 'parizkf'. Although it
    # is prefixed to a verb, it is not semantically the same as the other verb
    # prefixes. Here, though, we treat it as a verb prefix.
    all_prefixes.add('s')

    # Some prefixes have alternate forms.
    prefix_alternates = {
        'pi': 'api',
        'ut': 'ud',
        'Ri': 'ni',
        'niz': 'nis',
        'iz': 'nis',
        'palA': 'parA',
        'pali': 'pari',
        'z': 's',
    }
    all_prefixes.update(prefix_alternates.keys())

    # Loading sandhi rules
    sandhi = make_sandhi_object(sandhi_rules)

    with util.read_csv(prefixed_roots) as reader:
        rows = []
        for row in reader:
            # Nibble away at `prefixed_root` until we have all prefixes for the
            # given root.
            prefixes = []
            prefixed_root = row['prefixed_root']
            unprefixed_root = row['unprefixed_root']
            last_letter = None

            q = Queue.PriorityQueue()
            for remainder in sandhi.split_off(prefixed_root, unprefixed_root):
                q.put_nowait((0, (), remainder))

            while not q.empty():
                _, cur_prefixes, remainder = q.get_nowait()

                # `remainder` is something we recognize: we're done!
                if remainder in all_prefixes:
                    prefixes = list(cur_prefixes)
                    if remainder:
                        prefixes.append(remainder)
                        last_letter = remainder[-1]
                    break

                for before, after in sandhi.splits(remainder):
                    # Prevent recursion. As of this comment, the `splits` method
                    # returns the non-split of some term X as (X, ''). In other
                    # words, this conditional will *never* be true. But since the
                    # behavior of various functions is still unsettled, this check
                    # will stay here for the time being.
                    if after == remainder:
                        continue

                    if before in all_prefixes:
                        state = (cur_prefixes + (before,), after)
                        cost = len(after)

                        # Incentivize short vowels. This avoids errors with roots
                        # like "upodgrah" ("upa-ud-grah"). Without the incentive,
                        # we could have "upa-A-ud-grah" instead.
                        if before and before[-1] in 'aiufx':
                            cost -= 1
                        q.put_nowait((cost,) + state)

            # Convert 'alternate' prefixes back to their original forms.
            prefixes = [prefix_alternates.get(x, x) for x in prefixes]
            if not prefixes:
                # Occurs if the root's prefix is unrecognized
                continue

            # We still don't know the prefix group. We can find it by splitting
            # off the root and keeping whatever matches `last_letter`.
            for group in sandhi.split_off(prefixed_root, unprefixed_root):
                if group[-1] == last_letter:
                    break

            prefix_string = '-'.join(prefixes)
            rows.append((group, prefix_string))

    labels = ['group', 'prefixes']
    with util.write_csv(out_path, labels) as write_row:
        for row in util.unique(rows):
            datum = dict(zip(labels, row))
            write_row(datum)
def make_sandhi_object(sandhi_rules_file):
    """Makes a Sandhi object for splitting and joining verb prefixes."""
    with util.read_csv(sandhi_rules_file) as reader:
        rules = [(x['first'], x['second'], x['result']) for x in reader]
    return S.Sandhi(rules + S.PREFIX_SANDHI_RULES)
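# A hypothetical usage sketch (the file name and the pieces below are
# illustrative, not taken from the source). The Sandhi object returned above
# is what the writers rely on: `join` glues prefix pieces onto a form, and
# `split_off` enumerates candidate prefix groups of a prefixed root, exactly
# as in write_prefixed_shs_verbal_data and write_prefix_groups.
def _sandhi_example():
    sandhi = make_sandhi_object('sandhi-rules.csv')       # assumed rules file
    joined = sandhi.join(['upa', 'A', 'gam'])             # build a prefixed form
    groups = list(sandhi.split_off(joined, 'gam'))        # candidate prefix groups
    return joined, groups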