def parse_dups(dups_file, flat):
    """Classify the genes named in a tab-separated duplicates file.

    Each non-blank line of ``dups_file`` is ``parent<TAB>dup1<TAB>dup2...``.

    Args:
        dups_file: path of the duplicates file to read.
        flat: feature collection providing ``fill_dict()``, a mapping ``d``
            from accession to feature row, and
            ``get_features_in_region(seqid, start, end)``.

    Returns:
        dict mapping accession to one of:
          * ``'P'``         -- the accession is the parent (first column);
          * parent accn     -- the accession is a duplicate of that parent;
          * ``'I'``         -- the accession is returned by
            ``get_features_in_region`` for the span of the duplicate array
            but is not itself listed as a parent/duplicate so far.

    Raises:
        KeyError: if an accession in the file is missing from ``flat.d``.
    """
    # NOTE (original author): THIS ONLY WORKS IF WE CHANGE QUOTA
    flat.fill_dict()
    dup_dic = {}
    # A set gives O(1) membership tests; insertion order was never used.
    seen = set()

    with open(dups_file) as handle:
        for raw in handle:
            raw = raw.strip()
            if not raw:
                # A blank line names no genes; skip it rather than looking
                # up the empty accession in flat.d.
                continue
            fields = raw.split("\t")
            parent, dups = fields[0], fields[1:]

            # Every distinct member of the array (parent included), ordered
            # by start coordinate, to find the extent of the array.
            members = [Bed.row_to_dict(flat.d[accn]) for accn in set(fields)]
            members.sort(key=operator.itemgetter('start'))
            dup_start, dup_end = members[0], members[-1]

            # A parent is always (re)marked 'P', even if an earlier line
            # recorded it as somebody else's duplicate.
            dup_dic[parent] = 'P'
            seen.add(parent)
            for dup in dups:
                if dup in seen:
                    # Keep the first assignment for an accession.
                    continue
                seen.add(dup)
                dup_dic[dup] = parent

            # so here, there are all the genes that aren't part of the local
            # dup array, but we want to mark them with 'I'
            intervening = flat.get_features_in_region(
                dup_start['seqid'], dup_start['start'], dup_end['end'])
            for feature in intervening:
                accn = feature['accn']
                # The original compared the accession with the dup_end dict
                # itself (always False); compare with its accession instead.
                if accn == parent or accn == dup_end['accn']:
                    continue
                if accn not in dup_dic:
                    dup_dic[accn] = 'I'
    return dup_dic
def _read_anno_file(path):
    """Return a dict built from the ``key,value`` lines of the file at *path*."""
    with open(path) as handle:
        return dict([entry.rstrip().split(",") for entry in handle])


def write_genelist(q_or_s, outfile, flat, pairs, orthos, mcnss, link_fmt, this_org, other_org,
                   other_flat, dups, local_dups):
    """Write a tab-separated gene list for every feature in ``flat``.

    One header line is written, then one row per feature with the columns
    accn, seqid, start, end, ortholog, <other_org>_cns, regional_dup_info,
    local_dup_info, strand, new_gene_info and link.

    Args:
        q_or_s: ``'q'`` or ``'s'``; selects ``<q_or_s>_protein_rna.anno`` and
            is passed to ``split_pairs``/``split_cns`` as ``q_or_s == 's'``.
        outfile: path of the file to write (overwritten).
        flat: iterable of feature rows with a ``path`` attribute; the
            annotation files are read from the directory of ``flat.path``.
        pairs: mapping accn -> keys into ``other_flat.d``.
        orthos: passed through to ``split_pairs`` and ``split_cns``.
        mcnss: mapping accn -> CNS collection handed to ``split_cns``.
        link_fmt: %-format string using ``qorg``, ``sorg``, ``accn1``, ``accn2``.
        this_org, other_org: organism names; ``other_org`` also replaces the
            ``ortho_`` prefix in the header.
        other_flat: feature collection with ``path`` and a mapping ``d``.
        dups, local_dups: mappings accn -> text for the regional/local
            duplicate columns (empty string when absent).

    Returns:
        None. A progress message, and a "BAD" diagnostic for features that
        have orthologous CNSs but no ortholog, go to ``sys.stderr``.
    """
    # used in the link_fmt
    qorg, sorg = this_org, other_org
    is_subject = q_or_s == 's'

    fmt = "%(accn)s\t%(seqid)s\t%(start)i\t%(end)i\t%(ortholog)s\t%(ortho_cns)s\t"
    fmt += "%(regional_dup_info)s\t%(local_dup_info)s\t%(strand)s\t"
    fmt += "%(new_gene_info)s\t%(link)s"
    # The header is the format string with the %-directive syntax stripped.
    header = fmt.replace('%(', '').replace(')s', '').replace(')i', '')

    # op.join (instead of "%s/%s") keeps the path relative when flat.path has
    # no directory component, rather than anchoring it at the filesystem root.
    outdir = op.dirname(flat.path)
    annos = _read_anno_file(op.join(outdir, "%s_protein_rna.anno" % (q_or_s,)))
    if flat.path == other_flat.path:
        annos.update(_read_anno_file(op.join(outdir, "s_protein_rna.anno")))

    # write() calls emit exactly what the former print-chevron statements did.
    with open(outfile, 'w') as out:
        sys.stderr.write("writing genelist to %s\n" % (outfile,))
        out.write(header.replace('ortho_', other_org + '_') + "\n")

        for feat in flat:
            accn = feat['accn']
            these_pairs = pairs.get(accn, [])
            cnss = mcnss.get(accn, [])

            ortholog, other_pairs = split_pairs(
                feat, [other_flat.d[t] for t in these_pairs], orthos, is_subject)
            ortho_cns, _non_ortho_cns = split_cns(cnss, orthos, is_subject)

            if ortholog:
                link = link_fmt % dict(qorg=qorg, sorg=sorg,
                                       accn1=ortholog[0]['accn'], accn2=accn)
            else:
                link = ''

            new_gene_info = ""
            if accn.endswith(("_cns_protein", "_cns_rna")):
                # A missing entry comes from co-annotation of a previous run.
                new_gene_info = annos.get(accn, "")

            ortholog_accns = ",".join([o["accn"] for o in ortholog]) if ortholog else ""
            if len(ortho_cns) > 0 and not ortholog_accns:
                # fell right on the edge of a syntenic block. the cns got in,
                # but not the gene.
                sys.stderr.write(
                    "\nBAD %s \n%s \nthese: %s \nother: %s \n\n\n"
                    % (feat, ortho_cns, these_pairs, other_pairs))

            # Explicit row dict instead of mutating locals(); the feature's
            # own fields (accn, seqid, start, end, strand) are layered on top.
            row = {
                'ortholog': ortholog_accns,
                'regional_dup_info': dups.get(accn, ''),
                'local_dup_info': local_dups.get(accn, ''),
                'new_gene_info': new_gene_info,
                'link': link,
            }
            row.update(Bed.row_to_dict(feat))
            # The CNS count is only reported when there is an ortholog.
            row['ortho_cns'] = len(ortho_cns) if ortholog_accns else ""
            out.write(fmt % row + "\n")