def main():
    """Concatenate the ML trees of one shard of families into a shard file.

    Command line: ``%prog [options] family_accession_file``.  The positional
    argument is a text file holding one family accession per line.

    Options:
        -s/--total_num_shards      number of shards the accession list is
                                   divided into (default 1).
        -n/--current_shard_number  zero-based index of the shard handled by
                                   this job (default 0).
        -d/--description           label used as prefix of the shard file name
                                   (default '090811').

    For every accession in the selected shard the file
    ``<family dir>/<accession>_subst.ml`` is read and appended to
    ``/clusterfs/ohana/bpg/pfacts/<description>_trees_<n>-of-<total>`` as
    an accession line, the tree data, and a ``//`` terminator line.  Families
    whose tree file cannot be read are reported on stdout and skipped.

    Invalid options terminate the program through ``OptionParser.error``.
    """
    # parse command line options
    usage = "%prog [options] family_accession_file"
    opt_parser = OptionParser(usage=usage)
    opt_parser.add_option("-s", "--total_num_shards", dest="num_shards",
                          default=1,
                          help="Number of shards into which seeds are divided")
    opt_parser.add_option("-n", "--current_shard_number", dest="shard_number",
                          default=0,
                          help="Which shard number the current job is processing")
    opt_parser.add_option("-d", "--description", dest="description",
                          default='090811',
                          help="The description of the download files")
    (options, args) = opt_parser.parse_args()
    if len(args) != 1:
        opt_parser.error('Incorrect number of arguments')
    input_file = args[0]
    if not os.path.exists(input_file):
        opt_parser.error("Couldn't find %s" % input_file)
    try:
        num_shards = int(options.num_shards)
    except ValueError:
        opt_parser.error("--total_num_shards must be a number")
    try:
        shard_number = int(options.shard_number)
    except ValueError:
        opt_parser.error("--current_shard_number must be a number")
    if num_shards < 1:
        opt_parser.error("--total_num_shards must be at least 1")
    if shard_number < 0 or shard_number >= num_shards:
        opt_parser.error("--current_shard_number must be between 0 and %d"
                         % (num_shards - 1))

    f = open(input_file, "rU")
    try:
        family_accessions = [line.rstrip() for line in f]
    finally:
        f.close()

    # Ceiling division: every accession lands in exactly one of the
    # num_shards shards and the last shard takes the (possibly shorter)
    # remainder.  Dividing by (num_shards - 1), as this script used to do,
    # fails with ZeroDivisionError for the default of a single shard.
    total = len(family_accessions)
    shard_size = (total + num_shards - 1) // num_shards
    start = shard_number * shard_size
    end = min(start + shard_size, total)

    # The label comes from --description; the parser defines no 'date' option.
    shard_file = '/clusterfs/ohana/bpg/pfacts/%s_trees_%03d-of-%03d' \
        % (options.description, (shard_number + 1), num_shards)
    if start < end:
        print("Writing %s with ML trees from %s to %s" % (
            shard_file, family_accessions[start], family_accessions[end - 1]))
    else:
        # Trailing shards can be empty when there are fewer accessions than
        # shards; still create the (empty) shard file.
        print("Writing %s with no ML trees (shard is empty)" % shard_file)

    outf = open(shard_file, 'w')
    try:
        for family_accession in family_accessions[start:end]:
            family_dir = get_dir_of_family_accession(family_accession)
            tree_file = os.path.join(family_dir,
                                     '%s_subst.ml' % family_accession)
            try:
                inf = open(tree_file)
                try:
                    tree_data = inf.read()
                finally:
                    inf.close()
                outf.write("%s\n" % family_accession)
                outf.write(tree_data)
                outf.write("//\n")
            except IOError:
                print("Could not write %s to the shard" % tree_file)
    finally:
        outf.close()
def main():
    """Write every tree of one family as a Newick file in the family directory.

    Command line: ``<script> <family_accession>``.

    The numeric family id is parsed from the accession, the process changes
    into the directory returned by ``get_dir_of_family_accession`` and, for
    each ``Tree`` row of that family, writes ``<accession>_subst.<method>``
    through ``tree.write_newick_to_handle``.

    Exit status: 0 when called without an argument (usage is printed) or when
    all trees were written; 1 for an unparsable accession, a missing family
    directory, or an IOError while writing any tree.
    """
    if len(sys.argv) < 2:
        print("Usage: %s <family_accession>" % sys.argv[0])
        sys.exit(0)
    family_accession = sys.argv[1]
    try:
        # The numeric id follows a three-character prefix (accessions used
        # elsewhere in this file look like 'bpg0207937').  Slicing from
        # index 4 silently dropped the leading digit of the id.
        family_id = int(family_accession[3:])
    except ValueError:
        print("Usage: %s <family_accession>" % sys.argv[0])
        sys.exit(1)

    family_dir = get_dir_of_family_accession(family_accession)
    if not os.path.exists(family_dir):
        print("Family %s directory not found" % family_accession)
        sys.exit(1)
    # Output file names below are relative to the family directory.
    os.chdir(family_dir)

    trees = Tree.objects.filter(family__id=family_id)
    had_error = False
    for tree in trees:
        filename = '%s_subst.%s' % (family_accession, tree.method)
        try:
            outf = open(filename, "w")
            try:
                tree.write_newick_to_handle(outf)
            finally:
                # Close the handle even when writing fails part-way.
                outf.close()
            sys.stdout.write("Wrote %s\n" % filename)
        except IOError:
            sys.stderr.write("IOError while attempting to write %s\n"
                             % filename)
            had_error = True

    if had_error:
        sys.exit(1)
    sys.exit(0)
def main():
    """Append the HMMs of the listed families to the PhyloFacts HMM files.

    Options:
        -i/--input_file  text file with one family accession per line
                         (required).
        -d/--date        date-stamp used in the output file names
                         (default '090811').

    For every accession, ``<family dir>/<accession>.hmm`` is appended to
    both
      * ``/clusterfs/ohana/bpg/pfacts/phylofacts3_<date>.hmm`` -- the file
        holding all HMMs (for search), and
      * ``/clusterfs/ohana/bpg/pfacts/phylofacts3_incremental_<date>.hmm``
        -- the file holding only the newly added HMMs (for download).

    An HMM that cannot be read or appended is reported on stdout and the
    loop continues with the next family.
    """
    # parse command line options
    usage = "%prog [options] family_accession_file"
    opt_parser = OptionParser(usage=usage)
    opt_parser.add_option("-i", "--input_file", dest="input",
                          help="family_accession_file")
    opt_parser.add_option("-d", "--date", dest="date", default='090811',
                          help="The date-stamp from the incremental download files")
    (options, args) = opt_parser.parse_args()
    if not options.input:
        opt_parser.error('Please specify the family_accession_file')
    if not os.path.exists(options.input):
        opt_parser.error("Couldn't find %s" % options.input)
    input_file = options.input

    f = open(input_file, "rU")
    try:
        family_accessions = [line.rstrip() for line in f]
    finally:
        f.close()

    # the hmm file that has all hmms (for search)
    hmm_file = '/clusterfs/ohana/bpg/pfacts/phylofacts3_%s.hmm' \
        % (options.date)
    # the hmm file that has only the newly added hmms
    hmm_incremental_file = \
        '/clusterfs/ohana/bpg/pfacts/phylofacts3_incremental_%s.hmm' \
        % (options.date)

    print("Writing %s with HMMs" % (hmm_file))
    for family_accession in family_accessions:
        family_dir = get_dir_of_family_accession(family_accession)
        hmm = os.path.join(family_dir, '%s.hmm' % family_accession)
        # Append in Python rather than through os.system("cat ... >> ..."):
        # os.system never raises IOError, so failures went unnoticed, and
        # the accession and date-stamp were interpolated into a shell
        # command unquoted.
        try:
            src = open(hmm, 'rb')
            try:
                hmm_data = src.read()
            finally:
                src.close()
            # search file first, then the incremental download file
            for destination in (hmm_file, hmm_incremental_file):
                dst = open(destination, 'ab')
                try:
                    dst.write(hmm_data)
                finally:
                    dst.close()
        except IOError:
            print("Could not write %s to the hmm file" % hmm)
def main():
    """Append the HMMs of the listed families to a download HMM file.

    Options:
        -i/--input_file   text file with one family accession per line
                          (required).
        -d/--description  label used as the base name of the output file
                          (default '090811').

    For every accession, ``<family dir>/<accession>.hmm`` is appended to
    ``downloads/<description>.hmms`` (relative to the current working
    directory).  An HMM that cannot be read or appended is reported on
    stdout and the loop continues with the next family.
    """
    # parse command line options
    usage = "%prog [options] family_accession_file"
    opt_parser = OptionParser(usage=usage)
    opt_parser.add_option("-i", "--input_file", dest="input",
                          help="family_accession_file")
    opt_parser.add_option("-d", "--description", dest="description",
                          default='090811',
                          help="The description for download files")
    (options, args) = opt_parser.parse_args()
    if not options.input:
        opt_parser.error('Please specify the family_accession_file')
    if not os.path.exists(options.input):
        opt_parser.error("Couldn't find %s" % options.input)
    input_file = options.input

    f = open(input_file, "rU")
    try:
        family_accessions = [line.rstrip() for line in f]
    finally:
        f.close()

    hmm_file = 'downloads/%s.hmms' \
        % (options.description)
    print("Writing %s with HMMs" % (hmm_file))
    for family_accession in family_accessions:
        family_dir = get_dir_of_family_accession(family_accession)
        hmm = os.path.join(family_dir, '%s.hmm' % family_accession)
        # put newly added hmms to a new hmm file (for download).
        # Done in Python rather than through os.system("cat ... >> ..."):
        # os.system never raises IOError, so failures went unnoticed, and
        # the accession and description were interpolated into a shell
        # command unquoted.
        try:
            src = open(hmm, 'rb')
            try:
                hmm_data = src.read()
            finally:
                src.close()
            dst = open(hmm_file, 'ab')
            try:
                dst.write(hmm_data)
            finally:
                dst.close()
        except IOError:
            print("Could not write %s to the hmm file" % hmm)
def get_phobius_results_for_family(acc):
    """Return the Phobius-predicted regions stored for a family.

    The prediction is read from the first ``*.phobius`` file found in the
    directory returned by ``get_dir_of_family_accession(acc)``.  An earlier
    version submitted the sequence to the Phobius web server
    (http://phobius.sbc.su.se/) through mechanize and parsed the response
    page; that request was disabled because of problems with the server, so
    the parser below still expects the saved response text.

    INPUT  -> family accession.
    OUTPUT -> tuple of three lists of equal length, (regtype, regstart, regend):

        regtype  - list of strings; the region type of each predicted region.
        regstart - list of ints; the start index of each predicted region.
        regend   - list of ints; the end index of each predicted region.

    Three empty lists are returned when the directory has no ``.phobius``
    file or when the file holds no region records.

    USAGE (from the original notes, for family 'bpg0207937'):

        from bpg.common.utils import get_phobius_tmhelix
        (type, start, end) = \
            get_phobius_tmhelix.get_phobius_results_for_family('bpg0207937')
        type  -> ['TOPO_DOM:NON-CYTOPLASMIC.', 'TRANSMEM',
                  'TOPO_DOM:CYTOPLASMIC.', 'TRANSMEM', ...]
        start -> [1, 59, 80, 91, 117, ...]
        end   -> [58, 79, 90, 116, 135, ...]
    """
    # Get .phobius file from the directory
    directory = get_dir_of_family_accession(acc)
    phobius_files = glob.glob(os.path.join(directory, '*.phobius'))
    if not phobius_files:
        return ([], [], [])
    phobius_handle = open(phobius_files[0], 'r')
    try:
        result_output = phobius_handle.read()
    finally:
        # The handle used to be left open.
        phobius_handle.close()

    # Keep everything from the first newline at or after the
    # "Phobius prediction" heading.  When the heading is absent find()
    # returns -1, so the newline search starts at the last character.
    heading_pos = result_output.find("<h2>Phobius prediction</h2>")
    table_start = result_output.find("\n", heading_pos)
    head = result_output[table_start:]
    # The record table ends at the "//" terminator.
    raw_table = head.split("//")[0]
    records = raw_table.split("\n")
    if len(records) < 3:
        return ([], [], [])
    records = records[1:-1]  # remove the first and last empty entries

    # Now we have a list of records, iterate through it and pull out the
    # region type, region start and region end values and store these in
    # lists to return.
    regtype = []
    regstart = []
    regend = []
    for record in records:
        fields = record.split()
        regstart.append(int(fields[2]))
        regend.append(int(fields[3]))
        if len(fields) == 4:
            # type only, e.g. TRANSMEM
            regtype.append(fields[1])
        elif len(fields) == 5:
            # type plus a one-word qualifier
            regtype.append(fields[1] + ":" + fields[4])
        else:
            # type plus a two-word qualifier, joined with '-'
            regtype.append(fields[1] + ":" + fields[4] + "-" + fields[5])
    return (regtype, regstart, regend)
def main():
    """Find and store the seed sequence header of a family.

    Command line: ``<script> <family_accession>``.

    Steps:
      1. Parse the numeric id from the accession and load the ``Family``;
         a family whose status is "bad" is treated as not found.
      2. Locate the seed FASTA file: ``seed.fa`` in the family directory, or,
         for directories whose real path contains 'TreeFam', the single
         ``*_HUMAN*.fa`` file whose name has the form
         ``<identifier>_HUMAN.fa`` or ``<identifier>_HUMAN_<int>_<int>.fa``.
      3. Read the first record of that file, compute its SEGUID and extract
         an accession from the record id with the first of the UniProt / GI
         patterns that matches.
      4. Collect the ``SequenceHeader`` rows of sequences with that SEGUID
         whose header yields the same accession.  A header starting with
         'lcl|' or containing no '|' is taken as the only candidate.
      5. If several candidates remain, keep those aligned at the family's
         canonical root node.
      6. If exactly one candidate remains, assign it to
         ``family.seed_sequence_header`` and save the family.

    Exit status 0 when called without an argument (after ``usage()``); 1 for
    an unparsable accession, an unknown/bad family, a missing or empty seed
    file, or a seed id from which no accession can be parsed.
    """
    if len(sys.argv) < 2:
        usage()
        sys.exit(0)
    family_accession = sys.argv[1]
    try:
        family_id = int(family_accession[3:])
    except ValueError:
        usage()
        sys.exit(1)
    try:
        family = Family.objects.get(id=family_id)
        if family.status == "bad":
            raise Family.DoesNotExist
    except Family.DoesNotExist:
        print("No family found with accession %s" % family_accession)
        sys.exit(1)

    # --- locate the seed FASTA file ---------------------------------------
    family_dir = get_dir_of_family_accession(family_accession)
    seed_path = os.path.join(family_dir, "seed.fa")
    if not os.path.exists(seed_path):
        if os.path.realpath(family_dir).find('TreeFam') < 0:
            print("Seed file for family %s missing" % family_accession)
            sys.exit(1)
        # The glob pattern below is relative to the family directory.
        os.chdir(family_dir)
        possible_seed_files = glob.glob("*_HUMAN*.fa")
        candidates = set()
        swissprot_desc_re = re.compile('^%s$' % swissprot_desc_pat)
        for filename in possible_seed_files:
            basename = os.path.splitext(filename)[0]
            components = basename.split('_')
            if len(components) < 2 or components[1] != 'HUMAN':
                continue
            if swissprot_desc_re.match(components[0]) is None and \
                    uniprot_accession_re1.match(components[0]) is None and \
                    uniprot_accession_re2.match(components[0]) is None:
                continue
            if len(components) > 2:
                # Anything after _HUMAN must be exactly two integers.
                if len(components) != 4:
                    continue
                try:
                    int(components[2])
                    int(components[3])
                except ValueError:
                    continue
            candidates.add(filename)
        if len(candidates) != 1:
            print("Seed file for family %s missing" % family_accession)
            sys.exit(1)
        seed_path = os.path.join(family_dir, list(candidates)[0])

    # --- read the first record of the seed file ---------------------------
    seed_record = None
    f = open(seed_path)
    try:
        for seed_record in SeqIO.parse(f, "fasta"):
            break
    finally:
        f.close()
    if seed_record is None:
        # An empty seed file used to surface as an uncaught StopIteration.
        print("Seed file %s for family %s contains no sequence"
              % (seed_path, family_accession))
        sys.exit(1)
    seed_seguid = CheckSum.seguid(seed_record.seq)
    # Drop the 'lcl|' prefix only.  str.strip('lcl|') removes any run of the
    # characters 'l', 'c' and '|' from BOTH ends of the id.
    seed_id = seed_record.id
    if seed_id.startswith('lcl|'):
        seed_id = seed_id[len('lcl|'):]
    print("%s: FlowerPower seed id %s" % (family_accession, seed_id))

    # --- extract the accession from the seed id ---------------------------
    seed_accession = None
    recognizing_regexp = None
    # uniprot_accession_re1 recognizes a UniProt accession only if it is the
    # whole string, not if it is a substring, so the patterns are compiled
    # again here for use with search().
    for regexp in [re.compile(uniprot_accession_pat1),
                   re.compile(uniprot_accession_pat2),
                   gi_re]:
        m = regexp.search(seed_id)
        if m:
            seed_accession = m.group()
            recognizing_regexp = regexp
            break
    if seed_accession is None:
        print("Could not parse accession from seed id")
        sys.exit(1)

    # --- find sequence headers that carry the same accession --------------
    sequences = Sequence.objects.filter(seguid=seed_seguid)
    sequence_headers = SequenceHeader.objects.filter(sequence__in=sequences)
    possible_sequence_headers = set()
    for sequence_header in sequence_headers:
        header = sequence_header.header
        m = recognizing_regexp.search(header)
        if not m or m.group() != seed_accession:
            continue
        # A local ('lcl|') header or a bare header without '|' wins outright.
        if header.startswith('lcl|') or header.find('|') < 0:
            possible_sequence_headers = set([sequence_header])
            break
        possible_sequence_headers.add(sequence_header)

    if len(possible_sequence_headers) > 1:
        # Narrow down to headers aligned at the family's canonical root node.
        alns = TreeNodeAlignment.objects.filter(
            tree_node=family.canonical_root_node(),
            sequence_header__in=possible_sequence_headers)
        possible_sequence_headers = set([aln.sequence_header for aln in alns])

    print("%s: Found %d possible sequence headers" % (
        family_accession, len(possible_sequence_headers)))
    for seqhdr in possible_sequence_headers:
        print("%s: possible sequence header %s" % (family_accession,
                                                   seqhdr.header))
    if len(possible_sequence_headers) == 1:
        seed_sequence_header = list(possible_sequence_headers)[0]
        print("Assigning seed sequence header id %d to family %s"
              % (seed_sequence_header.id, family_accession))
        family.seed_sequence_header = seed_sequence_header
        family.save()