Python get_dir_of_family_accession Beispiele, bpg.common.utils.dir_of_family.get_dir_of_family_accession Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: write_bestFamily_MLtrees_to_shard.py Projekt: berkeleyphylogenomics/BPG_utilities

def main():
    """Write one shard of maximum-likelihood trees for a list of families.

    The single positional argument names a file with one family accession
    per line.  The accessions are divided into ``--total_num_shards``
    contiguous shards, and the trees of the shard selected by
    ``--current_shard_number`` (zero-based) are concatenated into one file
    under /clusterfs/ohana/bpg/pfacts.  For every family whose
    ``<accession>_subst.ml`` file can be read, the output receives the
    accession on its own line, the tree data, and a ``//`` terminator line.
    Families whose tree file cannot be read are reported and skipped.

    Command-line options:
        -s/--total_num_shards      number of shards (default 1)
        -n/--current_shard_number  zero-based index of this shard (default 0)
        -d/--description           label used in the shard file name
    """
    # parse command line options
    usage = "%prog [options] family_accession_file"
    opt_parser = OptionParser(usage=usage)
    opt_parser.add_option("-s",
                          "--total_num_shards",
                          dest="num_shards",
                          default=1,
                          help="Number of shards into which seeds are divided")
    opt_parser.add_option(
        "-n",
        "--current_shard_number",
        dest="shard_number",
        default=0,
        help="Which shard number the current job is processing")
    opt_parser.add_option("-d",
                          "--description",
                          dest="description",
                          default='090811',
                          help="The description of the download files")

    (options, args) = opt_parser.parse_args()
    # opt_parser.error() prints the message and exits, so each check below
    # only falls through when the value is acceptable.
    if len(args) != 1:
        opt_parser.error('Incorrect number of arguments')
    input_file = args[0]
    if not os.path.exists(input_file):
        opt_parser.error("Couldn't find %s" % input_file)
    try:
        num_shards = int(options.num_shards)
    except ValueError:
        opt_parser.error("--total_num_shards must be a number")
    try:
        shard_number = int(options.shard_number)
    except ValueError:
        opt_parser.error("--current_shard_number must be a number")
    if num_shards < 1:
        opt_parser.error("--total_num_shards must be at least 1")
    if not 0 <= shard_number < num_shards:
        opt_parser.error("--current_shard_number must be between 0 and %d"
                         % (num_shards - 1))

    with open(input_file, "rU") as f:
        family_accessions = [line.rstrip() for line in f.readlines()]

    # Ceiling division: num_shards shards of this size always cover the whole
    # list.  (Dividing by num_shards - 1 crashed with the default of one
    # shard and pushed the last shard past the end of the list whenever the
    # count divided evenly.)
    total = len(family_accessions)
    shard_size = (total + num_shards - 1) // num_shards
    start = shard_number * shard_size
    end = min(start + shard_size, total)

    # The label comes from --description; the parser defines no "date" option.
    shard_file = '/clusterfs/ohana/bpg/pfacts/%s_trees_%03d-of-%03d' \
                % (options.description, (shard_number + 1), num_shards)
    if start < end:
        print("Writing %s with ML trees from %s to %s" % (
            shard_file, family_accessions[start], family_accessions[end - 1]))
    else:
        # Possible when there are fewer accessions than shards.
        print("Writing %s with no ML trees (shard is empty)" % shard_file)

    with open(shard_file, 'w') as outf:
        for family_accession in family_accessions[start:end]:
            family_dir = get_dir_of_family_accession(family_accession)
            tree_file = os.path.join(family_dir,
                                     '%s_subst.ml' % family_accession)
            try:
                with open(tree_file) as inf:
                    tree_data = inf.read()
                outf.write("%s\n" % family_accession)
                outf.write(tree_data)
                outf.write("//\n")
            except IOError:
                # Best effort: report the failure and go on to the next family.
                print("Could not write %s to the shard" % tree_file)

Beispiel #2

0

Datei anzeigen

def main():
    """Write every stored tree of a family as a Newick file.

    Takes the family accession as the only command-line argument, changes
    into that family's directory, and writes one
    ``<accession>_subst.<method>`` file for each Tree row whose family id
    matches the accession.

    Exit status: 0 when no argument is given (usage is printed) or when all
    files were written; 1 when the accession cannot be parsed, the family
    directory does not exist, or any file could not be written.
    """
    if len(sys.argv) < 2:
        print("Usage: %s <family_accession>" % sys.argv[0])
        sys.exit(0)

    family_accession = sys.argv[1]
    try:
        # Accessions look like 'bpg0207937': a three-letter prefix followed
        # by the numeric id.  Slicing from index 4 silently dropped the first
        # digit, which only went unnoticed while that digit was a zero.
        family_id = int(family_accession[3:])
    except ValueError:
        print("Usage: %s <family_accession>" % sys.argv[0])
        sys.exit(1)

    family_dir = get_dir_of_family_accession(family_accession)
    if not os.path.exists(family_dir):
        print("Family %s directory not found" % family_accession)
        sys.exit(1)
    # File names below are relative, so they land in the family directory.
    os.chdir(family_dir)

    trees = Tree.objects.filter(family__id=family_id)
    had_error = False
    for tree in trees:
        filename = '%s_subst.%s' % (family_accession, tree.method)
        try:
            # The with-block closes the handle even if writing fails midway.
            with open(filename, "w") as outf:
                tree.write_newick_to_handle(outf)
            sys.stdout.write("Wrote %s\n" % filename)
        except IOError:
            sys.stderr.write("IOError while attempting to write %s\n" %
                             filename)
            # Keep going so the remaining trees are still attempted.
            had_error = True

    if had_error:
        sys.exit(1)
    sys.exit(0)

Beispiel #3

0

Datei anzeigen

def main():
  """Append the HMMs of newly added families to the PhyloFacts HMM files.

  Reads family accessions (one per line) from the file given with
  -i/--input_file and appends each family's ``<accession>.hmm`` file to two
  files under /clusterfs/ohana/bpg/pfacts, both named with the -d/--date
  stamp: the file holding all HMMs (used for search) and the file holding
  only the newly added HMMs (used for download).

  Families whose HMM cannot be copied are reported and skipped.
  """
  # parse command line options
  usage = "%prog [options] family_accession_file"
  opt_parser = OptionParser(usage=usage)
  opt_parser.add_option("-i", "--input_file", dest="input",
                        help="family_accession_file")
  opt_parser.add_option("-d", "--date", dest="date",
                        default='090811',
                        help="The date-stamp from the incremental download files")

  (options, args) = opt_parser.parse_args()
  # opt_parser.error() prints the message and exits.
  if not options.input:
    opt_parser.error('Please specify the family_accession_file')
  if not os.path.exists(options.input):
    opt_parser.error("Couldn't find %s" % options.input)
  input_file = options.input

  with open(input_file, "rU") as f:
    family_accessions = [line.rstrip() for line in f.readlines()]

  # the hmm file that has all hmms (for search)
  hmm_file = '/clusterfs/ohana/bpg/pfacts/phylofacts3_%s.hmm' \
              % (options.date)
  # the hmm file that has only the newly added hmms (for download)
  hmm_incremental_file = '/clusterfs/ohana/bpg/pfacts/phylofacts3_incremental_%s.hmm' \
              % (options.date)
  print("Writing %s with HMMs" % (hmm_file))
  for family_accession in family_accessions:
    family_dir = get_dir_of_family_accession(family_accession)
    hmm = os.path.join(family_dir, '%s.hmm' % family_accession)
    try:
      # Copy with plain file I/O instead of a shell "cat ... >> ...":
      # os.system never raises IOError, so failures used to go unreported,
      # and the accession text was interpolated into a shell command.
      with open(hmm, 'rb') as src:
        hmm_data = src.read()
      # Append to the search file first, then to the incremental file.
      for target in (hmm_file, hmm_incremental_file):
        with open(target, 'ab') as dst:
          dst.write(hmm_data)
    except IOError:
      print("Could not write %s to the hmm file" % hmm)

Beispiel #4

0

Datei anzeigen

def main():
    """Collect the HMMs of the listed families into one download file.

    Reads family accessions (one per line) from the file given with
    -i/--input_file and appends each family's ``<accession>.hmm`` file to
    ``downloads/<description>.hmms``, where the description comes from
    -d/--description.  The output path is relative to the current working
    directory.

    Families whose HMM cannot be copied are reported and skipped.
    """
    # parse command line options
    usage = "%prog [options] family_accession_file"
    opt_parser = OptionParser(usage=usage)
    opt_parser.add_option("-i",
                          "--input_file",
                          dest="input",
                          help="family_accession_file")
    opt_parser.add_option("-d",
                          "--description",
                          dest="description",
                          default='090811',
                          help="The description for download files")

    (options, args) = opt_parser.parse_args()
    # opt_parser.error() prints the message and exits.
    if not options.input:
        opt_parser.error('Please specify the family_accession_file')
    if not os.path.exists(options.input):
        opt_parser.error("Couldn't find %s" % options.input)
    input_file = options.input

    with open(input_file, "rU") as f:
        family_accessions = [line.rstrip() for line in f.readlines()]

    hmm_file = 'downloads/%s.hmms' \
                % (options.description)
    print("Writing %s with HMMs" % (hmm_file))
    for family_accession in family_accessions:
        family_dir = get_dir_of_family_accession(family_accession)
        hmm = os.path.join(family_dir, '%s.hmm' % family_accession)
        try:
            # put newly added hmms to a new hmm file (for download)
            # Done with plain file I/O instead of a shell "cat ... >> ...":
            # os.system never raises IOError, so failures used to go
            # unreported, and the accession text was interpolated into a
            # shell command.
            with open(hmm, 'rb') as src:
                hmm_data = src.read()
            with open(hmm_file, 'ab') as dst:
                dst.write(hmm_data)
        except IOError:
            print("Could not write %s to the hmm file" % hmm)

Beispiel #5

0

Datei anzeigen

Datei: get_phobius_tmhelix.py Projekt: berkeleyphylogenomics/BPG_utilities

def get_phobius_results_for_family(acc):
    """Return the Phobius-predicted regions stored for a family.

    The prediction is read from the first ``*.phobius`` file found in the
    family's directory; that file is parsed as the HTML result page of the
    Phobius web server.

    NOTE: this function used to submit the sequence to
    http://phobius.sbc.su.se/ (output format "nog") through mechanize.
    Because of problems with that server it now only uses whatever is
    already stored in the family directory.

    INPUT -> family accession.
    OUTPUT -> three lists of the same length, (regtype, regstart, regend):

    regtype  - list of strings; the region type of each predicted region.
    regstart - list of ints; the start index of each predicted region.
    regend   - list of ints; the end index of each predicted region.

    All three lists are empty when the directory has no ``.phobius`` file
    or the file holds no prediction table.

    USAGE:

    >>> from bpg.common.utils import get_phobius_tmhelix
    >>> (type, start, end) = get_phobius_tmhelix.get_phobius_results_for_family('bpg0207937')
    >>> type
    ['TOPO_DOM:NON-CYTOPLASMIC.', 'TRANSMEM', 'TOPO_DOM:CYTOPLASMIC.', 'TRANSMEM', 'TOPO_DOM:NON-CYTOPLASMIC.', 'TRANSMEM', 'TOPO_DOM:CYTOPLASMIC.', 'TRANSMEM', 'TOPO_DOM:NON-CYTOPLASMIC.', 'TRANSMEM', 'TOPO_DOM:CYTOPLASMIC.', 'TRANSMEM', 'TOPO_DOM:NON-CYTOPLASMIC.', 'TRANSMEM', 'TOPO_DOM:CYTOPLASMIC.']
    >>> start
    [1, 59, 80, 91, 117, 136, 157, 177, 200, 219, 248, 277, 299, 318, 337]
    >>> end
    [58, 79, 90, 116, 135, 156, 176, 199, 218, 247, 276, 298, 317, 336, 368]
    """
    # Get .phobius file from the directory
    directory = get_dir_of_family_accession(acc)
    phobius_files = glob.glob(os.path.join(directory, '*.phobius'))
    if not phobius_files:
        return ([], [], [])
    with open(phobius_files[0], 'r') as read_buffer:
        result_output = read_buffer.read()

    # Now we have an HTML response containing our TM helix info.  Parse that.
    # The table starts at the first newline after the heading and runs up to
    # the "//" terminator.
    heading_pos = result_output.find("<h2>Phobius prediction</h2>")
    if heading_pos < 0:
        return ([], [], [])
    table_start = result_output.find("\n", heading_pos)
    if table_start < 0:
        return ([], [], [])
    raw_table = result_output[table_start:].split("//")[0]
    records = raw_table.split("\n")
    if len(records) < 3:
        return ([], [], [])
    records = records[1:-1]  # remove the first and last empty entries

    # Now we have a list of records, iterate through it and pull out the region
    # type, region start and region end values and store these in lists to
    # return.
    regtype = []
    regstart = []
    regend = []
    for record in records:
        temprec = record.split()
        if not temprec:
            # A blank line inside the table carries no region.
            continue
        regstart.append(int(temprec[2]))
        regend.append(int(temprec[3]))
        # Fields after the end index qualify the region type.
        if len(temprec) == 4:
            regtype.append(temprec[1])
        elif len(temprec) == 5:
            regtype.append(temprec[1] + ":" + temprec[4])
        else:
            regtype.append(temprec[1] + ":" + temprec[4] + "-" + temprec[5])

    return (regtype, regstart, regend)

Beispiel #6

0

Datei anzeigen

def main():
    """Find the seed sequence header of a family and store it on the family.

    Takes the family accession as the only command-line argument.  Steps:

    1. Load the Family row whose id is the numeric part of the accession;
       a family with status "bad" is treated as missing.
    2. Locate the seed FASTA file: ``seed.fa`` in the family directory, or,
       for directories whose real path contains 'TreeFam', the single
       ``*_HUMAN*.fa`` file whose name starts with a SwissProt description
       or UniProt accession (optionally followed by two integer fields).
    3. Parse an accession (UniProt or GI) out of the seed record's id.
    4. Collect the SequenceHeader rows for sequences with the seed's SEGUID
       whose header yields the same accession; if several remain, keep only
       those aligned at the family's canonical root node.
    5. If exactly one header remains, assign it as the family's
       seed_sequence_header and save the family.

    Exits with status 1 when the accession is malformed, the family or seed
    file is missing or empty, or no accession can be parsed from the seed
    id; exits with 0 after printing usage when no argument is given.
    """
    if len(sys.argv) < 2:
        usage()
        sys.exit(0)

    family_accession = sys.argv[1]
    try:
        family_id = int(family_accession[3:])
    except ValueError:
        usage()
        sys.exit(1)

    try:
        family = Family.objects.get(id=family_id)
        if family.status == "bad":
            raise Family.DoesNotExist
    except Family.DoesNotExist:
        print("No family found with accession %s" % family_accession)
        sys.exit(1)

    family_dir = get_dir_of_family_accession(family_accession)
    seed_path = os.path.join(family_dir, "seed.fa")
    if not os.path.exists(seed_path):
        if os.path.realpath(family_dir).find('TreeFam') >= 0:
            # glob below uses a relative pattern, so work inside the
            # family directory.
            os.chdir(family_dir)
            possible_seed_files = glob.glob("*_HUMAN*.fa")
            candidates = set()
            swissprot_desc_re = re.compile('^%s$' % swissprot_desc_pat)
            for seed_file in possible_seed_files:
                basename = os.path.splitext(seed_file)[0]
                components = basename.split('_')
                if len(components) < 2 or components[1] != 'HUMAN':
                    continue
                if swissprot_desc_re.match(components[0]) is None and \
                    uniprot_accession_re1.match(components[0]) is None and \
                    uniprot_accession_re2.match(components[0]) is None:
                    continue
                if len(components) > 2:
                    # Only <id>_HUMAN_<int>_<int> is accepted beyond the
                    # plain <id>_HUMAN form.
                    if len(components) != 4:
                        continue
                    try:
                        int(components[2])
                        int(components[3])
                    except ValueError:
                        continue
                candidates.add(seed_file)
            if len(candidates) != 1:
                print("Seed file for family %s missing" % family_accession)
                sys.exit(1)
            seed_path = os.path.join(family_dir, list(candidates)[0])
        else:
            print("Seed file for family %s missing" % family_accession)
            sys.exit(1)

    # Only the first record of the seed file is used.
    with open(seed_path) as f:
        try:
            seed_record = next(SeqIO.parse(f, "fasta"))
        except StopIteration:
            print("Seed file for family %s is empty" % family_accession)
            sys.exit(1)
    seed_seguid = CheckSum.seguid(seed_record.seq)

    # Remove a leading 'lcl|' tag.  str.strip('lcl|') would instead strip any
    # of the characters 'l', 'c' and '|' from both ends of the id.
    seed_id = seed_record.id
    if seed_id.startswith('lcl|'):
        seed_id = seed_id[len('lcl|'):]
    print("%s: FlowerPower seed id %s" % (family_accession, seed_id))
    seed_accession = None
    recognizing_regexp = None
    # uniprot_accession_re1 recognizes a UniProt accession only if it is the
    # whole string, not if it is a substring
    for regexp in [
            re.compile(uniprot_accession_pat1),
            re.compile(uniprot_accession_pat2), gi_re
    ]:
        m = regexp.search(seed_id)
        if m:
            seed_accession = m.group()
            recognizing_regexp = regexp
            break

    if seed_accession is None:
        print("Could not parse accession from seed id")
        sys.exit(1)

    sequences = Sequence.objects.filter(seguid=seed_seguid)
    sequence_headers = SequenceHeader.objects.filter(sequence__in=sequences)
    possible_sequence_headers = set()
    for sequence_header in sequence_headers:
        m = recognizing_regexp.search(sequence_header.header)
        if m:
            accession = m.group()
            if accession == seed_accession:
                # A header starting with 'lcl|', or one without any '|',
                # wins outright over all other matches.
                if len(sequence_header.header) >= 4 and \
                    sequence_header.header[0:4] == 'lcl|':
                    possible_sequence_headers = set([sequence_header])
                    break
                if sequence_header.header.find('|') < 0:
                    possible_sequence_headers = set([sequence_header])
                    break
                possible_sequence_headers.add(sequence_header)
    if len(possible_sequence_headers) > 1:
        # Narrow down to headers that are aligned at the family's root node.
        alns = TreeNodeAlignment.objects.filter(
            tree_node=family.canonical_root_node(),
            sequence_header__in=possible_sequence_headers)
        possible_sequence_headers = set([aln.sequence_header for aln in alns])

    print("%s: Found %d possible sequence headers" % (
        family_accession, len(possible_sequence_headers)))
    for seqhdr in possible_sequence_headers:
        print("%s: possible sequence header %s" % (family_accession,
                                                   seqhdr.header))

    # Only an unambiguous match is written back to the database.
    if len(possible_sequence_headers) == 1:
        seed_sequence_header = list(possible_sequence_headers)[0]
        print("Assigning seed sequence header id %d to family %s"
              % (seed_sequence_header.id, family_accession))
        family.seed_sequence_header = seed_sequence_header
        family.save()