def extract_zhang(file_name, type): names_to_types = {} loader = TableLoader() print file_name print "*" * 100 entries = loader.load("zhang/final/%s" % file_name) for entry in entries: # print entry # i+=1 # # if i == 10: # break name = entry["name"] if "/" in name: print name, "might not be a srna type" if not names_to_types.has_key(name): names_to_types[name] = type else: print "[warning] multiple entries of the same name" for key, value in names_to_types.items(): print "name: %s\t|\t" % key, "type: %s" % value
def extract_bilusic(file_name, type): names_to_types = {} loader = TableLoader() print file_name print "*" * 100 entries = loader.load("bilusic/final/%s" % file_name) for entry in entries: # print entry # i+=1 # # if i == 10: # break name = entry["name"] if not names_to_types.has_key(name): names_to_types[name] = type if "5'utr" in name: names_to_types[name] = "5'utr" elif "3'utr" in name: names_to_types[name] = "3'utr" else: print "[warning] multiple entries of the same name" for key, value in names_to_types.items(): print "name: %s\t|\t" % key, "type: %s" % value
def extract_our_gens(): files = ["assign-type-to-all-chimeras-of-Iron_limitation_CL_FLAG207_208_305_all_fragments_l25.txt_all_interactions.with-type", "assign-type-to-all-chimeras-of-Log_phase_CL_FLAG101-104_108_109_all_fragments_l25.txt_all_interactions.with-type", "assign-type-to-all-chimeras-of-MG_hfq-WT101_cutadapt_bwa.bam_all_fragments_l25.txt_all_interactions.with-type", "assign-type-to-all-chimeras-of-MG_hfq-wt202_CL_Stationary_cutadapt_bwa.bam_all_fragments_l25.txt_all_interactions.with-type", "assign-type-to-all-chimeras-of-Stationary_CL_FLAG209_210_312_all_fragments_l25.txt_all_interactions.with-type", "assign-type-to-signif-chimeras-of-Iron_limitation_CL_FLAG207_208_305_all_fragments_l25.txt_sig_interactions.with-type", "assign-type-to-signif-chimeras-of-Log_phase_CL_FLAG101-104_108_109_all_fragments_l25.txt_sig_interactions.with-type", "assign-type-to-signif-chimeras-of-Stationary_CL_FLAG209_210_312_all_fragments_l25.txt_sig_interactions.with-type", "assign-type-to-single-counts-of-Iron_limitation_CL_FLAG207_208_305_all_fragments_l25.txt_single_counts.with-type", "assign-type-to-single-counts-of-Log_phase_CL_FLAG101-104_108_109_all_fragments_l25.txt_single_counts.with-type", "assign-type-to-single-counts-of-MG_hfq-WT101_cutadapt_bwa.bam_all_fragments_l25.txt_single_counts.with-type", "assign-type-to-single-counts-of-MG_hfq-wt202_CL_Stationary_cutadapt_bwa.bam_all_fragments_l25.txt_single_counts.with-type", "assign-type-to-single-counts-of-Stationary_CL_FLAG209_210_312_all_fragments_l25.txt_single_counts.with-type"] names_to_types = {} loader = TableLoader() for file_name in files: i = 0 print file_name print "*" * 100 entries = loader.load("our_files/%s" % file_name) for entry in entries: # print entry # i+=1 # # if i == 10: # break name = entry["rna1 name"] if not names_to_types.has_key(name): names_to_types[name] = entry["first_type"] name = entry["rna2 name"] if not names_to_types.has_key(name): names_to_types[name] = entry["second_type"] for key, 
value in names_to_types.items(): print "name: %s\t|\t" % key, "type: %s" % value
def extract_our_gens(): files = [ "assign-type-to-all-chimeras-of-Iron_limitation_CL_FLAG207_208_305_all_fragments_l25.txt_all_interactions.with-type", "assign-type-to-all-chimeras-of-Log_phase_CL_FLAG101-104_108_109_all_fragments_l25.txt_all_interactions.with-type", "assign-type-to-all-chimeras-of-MG_hfq-WT101_cutadapt_bwa.bam_all_fragments_l25.txt_all_interactions.with-type", "assign-type-to-all-chimeras-of-MG_hfq-wt202_CL_Stationary_cutadapt_bwa.bam_all_fragments_l25.txt_all_interactions.with-type", "assign-type-to-all-chimeras-of-Stationary_CL_FLAG209_210_312_all_fragments_l25.txt_all_interactions.with-type", "assign-type-to-signif-chimeras-of-Iron_limitation_CL_FLAG207_208_305_all_fragments_l25.txt_sig_interactions.with-type", "assign-type-to-signif-chimeras-of-Log_phase_CL_FLAG101-104_108_109_all_fragments_l25.txt_sig_interactions.with-type", "assign-type-to-signif-chimeras-of-Stationary_CL_FLAG209_210_312_all_fragments_l25.txt_sig_interactions.with-type", "assign-type-to-single-counts-of-Iron_limitation_CL_FLAG207_208_305_all_fragments_l25.txt_single_counts.with-type", "assign-type-to-single-counts-of-Log_phase_CL_FLAG101-104_108_109_all_fragments_l25.txt_single_counts.with-type", "assign-type-to-single-counts-of-MG_hfq-WT101_cutadapt_bwa.bam_all_fragments_l25.txt_single_counts.with-type", "assign-type-to-single-counts-of-MG_hfq-wt202_CL_Stationary_cutadapt_bwa.bam_all_fragments_l25.txt_single_counts.with-type", "assign-type-to-single-counts-of-Stationary_CL_FLAG209_210_312_all_fragments_l25.txt_single_counts.with-type" ] names_to_types = {} loader = TableLoader() for file_name in files: i = 0 print file_name print "*" * 100 entries = loader.load("our_files/%s" % file_name) for entry in entries: # print entry # i+=1 # # if i == 10: # break name = entry["rna1 name"] if not names_to_types.has_key(name): names_to_types[name] = entry["first_type"] name = entry["rna2 name"] if not names_to_types.has_key(name): names_to_types[name] = entry["second_type"] for key, 
value in names_to_types.items(): print "name: %s\t|\t" % key, "type: %s" % value
def __init__(self, our_tables_list, article_tables_list):
    """Load both table collections from (name, path) pairs.

    Our own tables go through OurTableLoader.loadUnprocessed(); the
    article tables go through the regular TableLoader.load() path.
    """
    our_loader = OurTableLoader()
    our_tables = []
    for table_name, table_path in our_tables_list:
        raw = our_loader.loadUnprocessed(table_path)
        our_tables.append(our_loader.createTable(table_name, raw))
    self._our_tables_list = our_tables

    article_loader = TableLoader()
    article_tables = []
    for table_name, table_path in article_tables_list:
        parsed = article_loader.load(table_path)
        article_tables.append(article_loader.createTable(table_name, parsed))
    self._article_tables_list = article_tables
def generate_table_old(table_path, name, is_our_table=False): if not is_our_table: loader = TableLoader() table = loader.createTable(name, loader.load(table_path)) else: loader = OurTableLoader() table = loader.createTable(name, loader.loadUnprocessed(table_path)) db = MySQLdb.connect(host="localhost", user="******", db="article_refactor_24_3_2016") cur = db.cursor(MySQLdb.cursors.DictCursor) # Generate the keys for the table id_keys = [ TableGlobals.FIRST_START_BASE_KEY, TableGlobals.FIRST_END_BASE_KEY, TableGlobals.FIRST_STRAND_KEY, TableGlobals.SECOND_START_BASE_KEY, TableGlobals.SECOND_END_BASE_KEY, TableGlobals.SECOND_STRAND_KEY ] for key in table._dctData.values()[0].keys(): if key not in id_keys: id_keys.append(key) fields = ", ".join("%s VARCHAR(200)" % key.replace(" ", "_").replace( "-", "_").replace("'", "").replace("/", "") for key in id_keys if key != "") print fields cur.execute("CREATE TABLE %s (%s)" % (table.get_name(), fields)) table_as_list = [] # Generate dictionary for each row according to the keys for key, value in table: id_values = key.split(Table.ID_DELIMITER) for id_key, id_val in zip(id_keys, id_values): value[id_key] = str(id_val) table_as_list.append(value) # print "-->", value # Go over the rows and add them to the db for row in table_as_list: values = ",".join("%s" % db.literal(str(row[key])) for key in id_keys if key != "") # print "INSERT INTO %s VALUES (%s)" % (table.get_name(), values) cur.execute("INSERT INTO %s VALUES (%s)" % (table.get_name(), values)) db.commit()
def generate_table_old(table_path, name, is_our_table=False): if not is_our_table: loader = TableLoader() table = loader.createTable(name, loader.load(table_path)) else: loader = OurTableLoader() table = loader.createTable(name, loader.loadUnprocessed(table_path)) db=MySQLdb.connect(host="localhost",user="******",db="article_refactor_24_3_2016") cur = db.cursor(MySQLdb.cursors.DictCursor) # Generate the keys for the table id_keys = [TableGlobals.FIRST_START_BASE_KEY, TableGlobals.FIRST_END_BASE_KEY, TableGlobals.FIRST_STRAND_KEY, TableGlobals.SECOND_START_BASE_KEY, TableGlobals.SECOND_END_BASE_KEY, TableGlobals.SECOND_STRAND_KEY] for key in table._dctData.values()[0].keys(): if key not in id_keys: id_keys.append(key) fields = ", ".join("%s VARCHAR(200)" % key.replace(" ", "_").replace("-", "_").replace("'", "").replace("/", "") for key in id_keys if key != "") print fields cur.execute("CREATE TABLE %s (%s)" % (table.get_name(), fields)) table_as_list = [] # Generate dictionary for each row according to the keys for key, value in table: id_values = key.split(Table.ID_DELIMITER) for id_key, id_val in zip(id_keys, id_values): value[id_key] = str(id_val) table_as_list.append(value) # print "-->", value # Go over the rows and add them to the db for row in table_as_list: values = ",".join("%s" % db.literal(str(row[key])) for key in id_keys if key != "") # print "INSERT INTO %s VALUES (%s)" % (table.get_name(), values) cur.execute("INSERT INTO %s VALUES (%s)" % (table.get_name(), values)) db.commit()
def lybecker_update(file_name, show_warnings=True, overlap_delimiter="/",
                    overlap_field="annotation of overlapping genes",
                    adjacent_field="adjacent genes",
                    loader_type=LybeckerS2TableLoader):
    """Rewrite a lybecker table with normalized names and strand info.

    Each record is matched against the gene table by coordinates; records
    whose overlapping-gene annotation is confirmed get an "overlapping_..."
    name built from the matched gene names, others get a synthetic
    "lybecker_<category>_<coords>" name.  The result is dumped to
    ``lybecker/final/updated_<file_name>``.

    :param file_name: file name under ``lybecker/final/`` to load.
    :param show_warnings: when True, print per-record diagnostics about
        mismatches between the annotation and the gene-table matches.
    :param overlap_delimiter: delimiter between gene names in the
        annotation fields.
    :param overlap_field: column holding the overlapping-genes annotation.
    :param adjacent_field: column holding the adjacent genes (used for
        "intergenic" records).
    :param loader_type: loader class used to parse the input file.
    """
    geneLoader = GeneTableLoader()
    gene_table = geneLoader.createTable("genes",
                                        geneLoader.loadUnprocessed("./genes.col"))
    loader = loader_type()
    table = loader.createTable("lybecker",
                               loader.load("lybecker/final/%s" % file_name))
    new_table_raw = []
    for id, info in table:
        dct = {}
        info.pop(Table.UNIQUE_ID_FIELD)
        # The record id packs start;end;strand (and possibly more fields).
        start, end, strand = id.split(";")[:3]
        # print info["name"]
        start = int(start)
        end = int(end)
        # "none" = ignore strand when looking for overlapping genes.
        result = gene_table.is_overlaps(start, end, "none")
        if show_warnings:
            print id
            overlaps = [gene for gene in
                        info[overlap_field].split(overlap_delimiter)
                        if gene != ""]
            print overlaps
            # Compare the annotated overlap names against the gene-table hits.
            for index, gene in enumerate(result):
                print "%d: %s" % (index, gene[1][1]["name"])
                if gene[1][1]["name"] not in overlaps:
                    print "[warning] older name is being used"
            if len(overlaps) > len(result):
                print "[warning] missing overlapping gene"
            if len(overlaps) < len(result):
                print "[warning] extra overlapping gene"
        # Set the record location
        dct[TableGlobals.FIRST_START_BASE_KEY], dct[TableGlobals.FIRST_END_BASE_KEY], \
            dct[TableGlobals.FIRST_STRAND_KEY], dct[TableGlobals.SECOND_START_BASE_KEY], \
            dct[TableGlobals.SECOND_END_BASE_KEY], dct[TableGlobals.SECOND_STRAND_KEY] = \
            id.split(Table.ID_DELIMITER)
        # Assume unknown strand
        dct[TableGlobals.FIRST_STRAND_KEY] = "none"
        dct[TableGlobals.SECOND_STRAND_KEY] = "none"
        # result[0][0] appears to flag an exact/valid match from is_overlaps
        # — TODO confirm against the Table implementation.
        is_valid = result[0][0]
        overlap_names = info[overlap_field].split(overlap_delimiter)
        # Check if major in overlapping names
        if not is_valid:
            is_valid = False
            # print "major names", [gene[1][1]["name"] for gene in result]
            for first in [gene[1][1]["name"] for gene in result]:
                for second in overlap_names:
                    # Substring match tolerates suffixed/decorated names.
                    if first in second:
                        is_valid = True
                        break
                if is_valid:
                    break
        # check if minor in overlapping names
        if not is_valid:
            is_valid = False
            other_names = []
            for gene in result:
                other_names.extend(gene[1][1]["other_names"])
            # print "old names", other_names
            for first in other_names:
                for second in overlap_names:
                    if first in second:
                        is_valid = True
                        break
                if is_valid:
                    break
        if "intergenic" == info["category"]:
            # Intergenic records never count as overlapping; derive the strand
            # from one of the adjacent genes instead.
            is_valid = False
            adjacent_genes = [val for val in
                              info[adjacent_field].split(overlap_delimiter)
                              if val != ""]
            first = gene_table.findByName(adjacent_genes[0])
            second = gene_table.findByName(adjacent_genes[1])
            if first == (None, None):
                first = gene_table.findByOtherNames(adjacent_genes[0])
            if second == (None, None):
                second = gene_table.findByOtherNames(adjacent_genes[1])
            if first != (None, None):
                representing = first
            elif second != (None, None):
                representing = second
            else:
                raise BaseException("No presenting gene found")
            strand = representing[0].split(Table.ID_DELIMITER)[2]
            # NOTE(review): the record is assigned the strand OPPOSITE to the
            # representing adjacent gene — presumably because these are
            # antisense candidates; confirm with the callers.
            if strand == TableGlobals.STRAND_POSITIVE:
                dct[TableGlobals.FIRST_STRAND_KEY] = TableGlobals.STRAND_NEGATIVE
                dct[TableGlobals.SECOND_STRAND_KEY] = TableGlobals.STRAND_NEGATIVE
            else:
                dct[TableGlobals.FIRST_STRAND_KEY] = TableGlobals.STRAND_POSITIVE
                dct[TableGlobals.SECOND_STRAND_KEY] = TableGlobals.STRAND_POSITIVE
        if is_valid:
            # Divergent/convergent categories are excluded from overlapping.
            is_valid = "divergent" not in info["category"] and \
                "convergent" not in info["category"]
        # Update the record name if gene was found
        if is_valid:
            info["name"] = "overlapping_"
        else:
            name_id = id.split(Table.ID_DELIMITER)[:2]
            name_id.append(dct[TableGlobals.FIRST_STRAND_KEY])
            info["name"] = "lybecker_%s_%s" % (info["category"],
                                               Table.ID_DELIMITER.join(name_id))
        pos_count = 0
        neg_count = 0
        if is_valid:
            # for each gene match
            for entry in result:
                # exact match add name
                info["name"] += "%s." % entry[1][1]["name"]
                strand = entry[1][0].split(Table.ID_DELIMITER)[2]
                if TableGlobals.STRAND_NEGATIVE == strand:
                    neg_count += 1
                if TableGlobals.STRAND_POSITIVE == strand:
                    pos_count += 1
            # NOTE(review): assigning NEGATIVE when neg_count == 0 (and
            # POSITIVE when pos_count == 0) looks inverted relative to the
            # counts gathered above — confirm the intended antisense
            # semantics before changing anything here.
            if neg_count == 0:
                dct[TableGlobals.FIRST_STRAND_KEY] = TableGlobals.STRAND_NEGATIVE
                dct[TableGlobals.SECOND_STRAND_KEY] = TableGlobals.STRAND_NEGATIVE
            elif pos_count == 0:
                dct[TableGlobals.FIRST_STRAND_KEY] = TableGlobals.STRAND_POSITIVE
                dct[TableGlobals.SECOND_STRAND_KEY] = TableGlobals.STRAND_POSITIVE
            else:
                # `entry` leaks from the for loop above (last gene match).
                print "no strand match: %s" % entry[1][1]["name"]
        # remove extra . from name if match found
        if is_valid:
            info["name"] = info["name"][:-1]
        dct.update(info)
        new_table_raw.append(dct)
        if show_warnings:
            print 20 * "*"
    # for row in new_table_raw:
    #     print row
    TableLoader().createTable("updated_lybecker", new_table_raw).dump(
        "lybecker/final/updated_%s" % file_name)
def generate_zhang_stats(): loader = TableLoader() rows_as_dictionary = loader.load("output/table_s6_range_0.csv") row_list = [] header = ["name", "il_rna2_percent", "stat_rna2_percent", "log_rna2_percent", "k31_ip", # distal "r16a_ip", # rim "q8a_ip", # proximal "k31", "r16a", "q8a", "average_percent"] for row in rows_as_dictionary: new_row = [row["name"], row["signif_chimeras_of_iron_limitation_cl.as_rna2_percentage"].replace("-", ""), row["signif_chimeras_of_stationary_cl.as_rna2_percentage"].replace("-", ""), row["signif_chimeras_of_log_phase_cl.as_rna2_percentage"].replace("-", "")] matches = get_zhang_stats_by_name(row["name"]) if len(matches) > 1: print "warning too many results" elif len(matches) == 1: new_row.extend(val for val in matches[0]) # print new_row else: new_row.extend([""] * 6) average_percent = 0.0 fields = 0 if row["signif_chimeras_of_iron_limitation_cl.as_rna2_percentage"] != "-": average_percent += float(row["signif_chimeras_of_iron_limitation_cl.as_rna2_percentage"]) fields += 1 if row["signif_chimeras_of_stationary_cl.as_rna2_percentage"] != "-": average_percent += float(row["signif_chimeras_of_stationary_cl.as_rna2_percentage"]) fields += 1 if row["signif_chimeras_of_log_phase_cl.as_rna2_percentage"] != "-": average_percent += float(row["signif_chimeras_of_log_phase_cl.as_rna2_percentage"]) fields += 1 average_percent /= fields new_row.append(average_percent) row_list.append(new_row) # for row in row_list: # print row fl = open("zhang_stats.csv", "wb") fl.write("%s\n" % "\t".join(header)) for row in row_list: fl.write("%s\n" % "\t".join(str(val) for val in row)) fl.close()
def generate_zhang_stats(): loader = TableLoader() rows_as_dictionary = loader.load("output/table_s6_range_0.csv") row_list = [] header = [ "name", "il_rna2_percent", "stat_rna2_percent", "log_rna2_percent", "k31_ip", # distal "r16a_ip", # rim "q8a_ip", # proximal "k31", "r16a", "q8a", "average_percent" ] for row in rows_as_dictionary: new_row = [ row["name"], row["signif_chimeras_of_iron_limitation_cl.as_rna2_percentage"]. replace("-", ""), row["signif_chimeras_of_stationary_cl.as_rna2_percentage"].replace( "-", ""), row["signif_chimeras_of_log_phase_cl.as_rna2_percentage"].replace( "-", "") ] matches = get_zhang_stats_by_name(row["name"]) if len(matches) > 1: print "warning too many results" elif len(matches) == 1: new_row.extend(val for val in matches[0]) # print new_row else: new_row.extend([""] * 6) average_percent = 0.0 fields = 0 if row["signif_chimeras_of_iron_limitation_cl.as_rna2_percentage"] != "-": average_percent += float( row["signif_chimeras_of_iron_limitation_cl.as_rna2_percentage"] ) fields += 1 if row["signif_chimeras_of_stationary_cl.as_rna2_percentage"] != "-": average_percent += float( row["signif_chimeras_of_stationary_cl.as_rna2_percentage"]) fields += 1 if row["signif_chimeras_of_log_phase_cl.as_rna2_percentage"] != "-": average_percent += float( row["signif_chimeras_of_log_phase_cl.as_rna2_percentage"]) fields += 1 average_percent /= fields new_row.append(average_percent) row_list.append(new_row) # for row in row_list: # print row fl = open("zhang_stats.csv", "wb") fl.write("%s\n" % "\t".join(header)) for row in row_list: fl.write("%s\n" % "\t".join(str(val) for val in row)) fl.close()
def format_final_table(path, our_tables, output_file):
    """Build the human-readable final results table and write it as TSV.

    Loads the aggregate results at *path*, expands them into one row per
    RNA with per-condition interaction counts/fractions, motif columns,
    and per-paper hit columns, then rewrites names/ids/types to their
    presentation form and writes everything to *output_file*.

    :param path: input table path for TableLoader.load().
    :param our_tables: passed to get_name_dictionary /
        get_ecocyc_id_dictionary for the name/id rewrites at the end.
    :param output_file: destination TSV path.
    """
    # Paper -> list of per-table result columns in the input rows.
    sets = {
        "Raghavan et al 2011": ["raghavan_s5", "raghavan_s6", "raghavan_s7",
                                "raghavan_2"],
        "Lybecker et al 2014": ["lybecker_s1", "lybecker_s2"],
        "Bilusic et al 2014": [
            "bilusic_s1", "bilusic_s2", "bilusic_s3_1", "bilusic_s3_2",
            "bilusic_s4_1", "bilusic_s4_2"
        ]
        # "zhang": ["zhang_s3_2013_sheet2008", "zhang_s3_2013_sheet2009",
        #           "zhang_s4_2013_sheet2008", "zhang_s4_2013_sheet2009"],
    }
    # These two papers are handled separately: their columns are not
    # per-condition (see the loops near the end).
    mcdowell_set = {"McDowall et al 2014": ["mcdowell"]}
    thomason_set = {
        "Thomason et al 2015": [
            "thomason", "thomason_primary", "thomason_secondary",
            "thomason_internal", "thomason_antisense",
            "thomason_putative_asrna"
        ]
    }
    conditions = [
        "signif_chimeras_of_iron_limitation_cl",
        "signif_chimeras_of_log_phase_cl",
        "signif_chimeras_of_stationary_cl"
    ]
    # Display names, index-aligned with `conditions` (via
    # get_condition_by_header_name).
    conditions_beauty_names = ["Iron limitation", "Log", "Stationary"]
    # Per-table short labels used inside the paper hit cells.
    short_name = {
        "raghavan_s5": "R1",
        "raghavan_s6": "R2",
        "raghavan_s7": "R3",
        "raghavan_2": "R4",
        # "raghavan_s8": "R4",
        "lybecker_s1": "L1",
        "lybecker_s2": "L2",
        "bilusic_s1": "B1",
        "bilusic_s2": "B2",
        "bilusic_s3_1": "B3_1",
        "bilusic_s3_2": "B3_2",
        "bilusic_s4_1": "B4_1",
        "bilusic_s4_2": "B4_2",
        # "zhang_s3_2013_sheet2008": "Z1",
        # "zhang_s3_2013_sheet2009": "Z2",
        # "zhang_s4_2013_sheet2008": "Z3",
        # "zhang_s4_2013_sheet2009": "Z4",
        "thomason": "T1",
        "thomason_primary": "T1_1",
        "thomason_secondary": "T1_2",
        "thomason_internal": "T1_3",
        "thomason_antisense": "T1_4",
        "thomason_putative_asrna": "T1_5",
        "mcdowell": "M1"
    }
    # Internal type codes -> presentation type names.
    beauty_type_names = {
        "3utr": "3UTR",
        "5utr": "5UTR",
        "as": "AS",
        "cis_as_with_trans_t": "cASt",
        "igr": "IGR",
        "mrna": "CDS",
        "other-ncrna": "oRNA",
        "srna": "sRNA",
        "trna": "tRNA",
        "tu": "IGT"
    }
    loader = TableLoader()
    results = loader.load(path)
    header = [
        "Name", "EcoCyc id", "Type", "Total UI", "sRNA UI",
        "CDS & 5'UTR UI", "3'UTR & IGR UI"
    ]
    # The header is built incrementally; each start_/end_ index pair below
    # delimits a column group that the row-filling loops slice out later.
    start_of_total_interactions = len(header)
    for cond_name in conditions_beauty_names:
        header.append("TNR %s" % cond_name)
    end_of_total_interactions = len(header)
    start_of_interactions = end_of_total_interactions
    for cond_name in conditions_beauty_names:
        header.append("Fraction as RNA2 %s" % cond_name)
    end_of_interactions = len(header)
    header.extend([
        "Longest U tract", "MEME E-value", "MAST P-value", "Meme motif",
        "Total number of targets", "Number of targets with motif",
        "Overlaps known binding site"
    ])
    start_of_regular_tables = len(header)
    # NOTE(review): the paper columns follow dict iteration order of `sets`
    # (arbitrary but stable in Python 2); the same order is relied on when
    # slicing header[start_of_regular_tables:end_of_regular_tables] below.
    for set_name in sets:
        header.append(set_name)
    end_of_regular_tables = len(header)
    for set_name in thomason_set:
        header.append(set_name)
    end_of_thomason_tables = len(header)
    for set_name in mcdowell_set:
        header.append(set_name)
    end_of_tables = len(header)
    header.append("# of supporting papers")
    final_rows = []
    # Go over the rows and fill according to the header
    for index, row in enumerate(results):
        row_values = [
            row["name"], row["ecocyc_id"], row["type"], row["total_targets"],
            row["tb_srna_targets"], row["mrna_5utr_targets"],
            row["igr_3utr_targets"]
        ]
        # Total interactions
        for field in header[
                start_of_total_interactions:end_of_total_interactions]:
            cond_name = get_condition_by_header_name(field, conditions,
                                                     conditions_beauty_names)
            first_count = int(row["%s_first_interactions" % cond_name])
            second_count = int(row["%s_second_interactions" % cond_name])
            row_values.append(first_count + second_count)
        # interactions percentage
        for field in header[start_of_interactions:end_of_interactions]:
            cond_name = get_condition_by_header_name(field, conditions,
                                                     conditions_beauty_names)
            first_count = float(row["%s_first_interactions" % cond_name])
            second_count = float(row["%s_second_interactions" % cond_name])
            total_count = first_count + second_count
            # "-" marks "no interactions at all" rather than a 0.00 fraction.
            if float(total_count) == 0:
                res = "-"
            else:
                res = second_count / total_count
                res = "%.2f" % res
            row_values.append(res)
        row_values.extend([
            row["max_poly_u_length"], row["meme"].upper(),
            row["mast"].upper(), row["motif"],
            row["total_number_of_targets"],
            row["number_of_targets_with_motif"], row["binding_site_state"]
        ])
        total_articles = 0
        # Go over the table hit fields and merge columns
        for set_name in header[start_of_regular_tables:end_of_regular_tables]:
            field_values = []
            for cond_name in conditions:
                for table in sets[set_name]:
                    # Each cell is "+"/"-"; anything else is a data error.
                    if row["%s_%s" % (table, cond_name)] == "+":
                        field_values.append(short_name[table])
                    elif row["%s_%s" % (table, cond_name)] == "-":
                        continue
                    else:
                        print "[warning] invalid value for cell"
            if len(field_values) > 0:
                total_articles += 1
            # De-duplicate the per-table labels within the paper cell.
            row_values.append(";".join(list(set(field_values))))
        # Go over the table hit fields and merge columns - for thomason
        for set_name in header[end_of_regular_tables:end_of_thomason_tables]:
            field_values = []
            for table in thomason_set[set_name]:
                if row[table] == "+":
                    field_values.append(short_name[table])
                elif row[table] == "-":
                    continue
                else:
                    print "[warning] invalid value for cell"
            if len(field_values) > 0:
                total_articles += 1
            row_values.append(";".join(list(set(field_values))))
        # Go over the table hit fields and merge columns - for mcdowell
        for set_name in header[end_of_thomason_tables:end_of_tables]:
            field_values = []
            for table in mcdowell_set[set_name]:
                if row[table] == "+":
                    field_values.append(short_name[table])
                elif row[table] == "-":
                    continue
                else:
                    print "[warning] invalid value for cell"
            if len(field_values) > 0:
                total_articles += 1
            row_values.append(";".join(list(set(field_values))))
        row_values.append(total_articles)
        final_rows.append(row_values)
    # go over the rows and fix name, id, type notations
    names = get_name_dictionary(our_tables)
    ecocyc_ids = get_ecocyc_id_dictionary(our_tables)
    for row in final_rows:
        row[0] = names[row[0]]
        row[1] = ecocyc_ids[row[1]]
        row[2] = beauty_type_names[row[2]]
        # Strip a trailing numeric suffix from the EcoCyc id (e.g. ".1").
        if row[1].split(".")[-1].isdigit():
            row[1] = ".".join(row[1].split(".")[:-1])
        # Present transcription units as IGT in both name and id.
        if ".TU" in row[0]:
            row[0] = row[0].replace(".TU", ".IGT")
        if ".TU" in row[1]:
            row[1] = row[1].replace(".TU", ".IGT")
    with open(output_file, "wb") as fl:
        fl.write("%s\n" % "\t".join(header))
        for row in final_rows:
            fl.write("%s\n" % "\t".join(str(val) for val in row))