def calculate_sequence_lengths(in_file, delimiter):
  """Build a two-level table of sequence lengths from a FASTA source.

  Every (header, sequence) pair yielded by ``binf.parse_fasta(in_file)`` is
  split into IDs with ``extract_ids(header, delimiter)``, and the length of
  the sequence is stored under its gene ID and isoform ID.

  Returns:
    A ``defaultdict`` mapping gene ID -> ``defaultdict(int)`` mapping
    isoform ID -> ``len(sequence)``. When a gene/isoform pair occurs more
    than once, the length of the last occurrence is kept.
  """
  def new_isoform_table():
    return defaultdict(int)

  lengths_by_gene = defaultdict(new_isoform_table)

  for fasta_header, sequence in binf.parse_fasta(in_file):
    # extract_ids must produce exactly three values; the first is unused here.
    _seq_id, gene, isoform = extract_ids(fasta_header, delimiter)
    sequence_length = len(sequence)
    lengths_by_gene[gene][isoform] = sequence_length

  return lengths_by_gene
Example #2
0
def calculate_sequence_lengths(in_file, delimiter):
    """Record the length of every sequence, keyed by gene and isoform.

    Records come from ``binf.parse_fasta(in_file)`` as (header, sequence)
    pairs; ``extract_ids(header, delimiter)`` supplies the gene and isoform
    IDs used as keys.

    Returns:
        A nested ``defaultdict``: ``result[gene_id][isoform_id]`` is the
        length of that sequence. A repeated gene/isoform pair overwrites
        the earlier entry.
    """
    table = defaultdict(lambda: defaultdict(int))

    for record_header, record_seq in binf.parse_fasta(in_file):
        # Three values are expected back; only the gene and isoform IDs
        # are needed for the table.
        _unused_seq_id, gene_key, isoform_key = extract_ids(
            record_header, delimiter)
        n_residues = len(record_seq)
        gene_entry = table[gene_key]
        gene_entry[isoform_key] = n_residues

    return table
def print_longest_isoforms_for_each_gene(in_file, delimiter, seq_lengths):
  """Write one maximal-length isoform per gene to standard output.

  Iterates the (header, sequence) pairs from ``binf.parse_fasta(in_file)``.
  For each record whose gene is still present in ``seq_lengths``, the
  sequence length is compared with the value returned by
  ``find_max_isoform_length_for_gene(gene_id, seq_lengths)``; on a match the
  record is written with ``binf.write_fasta_seq`` to ``sys.stdout``.

  Note:
    ``seq_lengths`` is modified in place: a gene's entry is deleted once a
    record for it has been written, so at most one record per gene is
    emitted (the first one encountered with the matching length).
  """
  for fasta_header, sequence in binf.parse_fasta(in_file):
    _seq_id, gene, _isoform = extract_ids(fasta_header, delimiter)

    if gene in seq_lengths:
      longest = find_max_isoform_length_for_gene(gene, seq_lengths)

      if len(sequence) != longest:
        continue

      binf.write_fasta_seq(sys.stdout, fasta_header, sequence)
      # Dropping the gene from seq_lengths is how it gets flagged as done:
      # later records for this gene fail the membership test above.
      del seq_lengths[gene]
Example #4
0
def print_longest_isoforms_for_each_gene(in_file, delimiter, seq_lengths):
    """Emit, for each gene, the first record whose length is the gene's maximum.

    Records are read from ``binf.parse_fasta(in_file)`` and their IDs are
    obtained with ``extract_ids(header, delimiter)``. The maximum length for
    a gene is whatever ``find_max_isoform_length_for_gene`` returns for it.
    Matching records are written to ``sys.stdout`` via
    ``binf.write_fasta_seq``.

    Side effect:
        Entries are deleted from the caller's ``seq_lengths`` as genes are
        printed; genes absent from ``seq_lengths`` are skipped entirely.
    """
    for record_header, record_seq in binf.parse_fasta(in_file):
        _unused_seq_id, gene_key, _unused_isoform = extract_ids(
            record_header, delimiter)

        gene_is_pending = gene_key in seq_lengths
        if not gene_is_pending:
            continue

        target_length = find_max_isoform_length_for_gene(
            gene_key, seq_lengths)
        is_longest = len(record_seq) == target_length
        if not is_longest:
            continue

        binf.write_fasta_seq(sys.stdout, record_header, record_seq)
        # Removing the gene marks it as already printed, so any further
        # records for the same gene are skipped by the check above.
        del seq_lengths[gene_key]
def main():
    """Rename FASTA records read from stdin and save the name mapping.

    Command-line arguments (``sys.argv``):
        1: sequence-set ID used as the prefix of every new name.
        2: path of the FASTA file to write with the renamed records.
        3: path of the JSON file to write with the name mapping.

    Each record from ``binf.parse_fasta(sys.stdin)`` is given the name
    ``<seq_set_id>_prot<N>``, with N counting up from 1 in input order. The
    JSON file maps every new name to the record's original identifier.
    """
    seq_set_id = sys.argv[1]
    munged_fasta_filename = sys.argv[2]
    mapping_filename = sys.argv[3]

    new_to_original = {}

    with open(munged_fasta_filename, "w") as fasta_out:
        numbered_records = enumerate(binf.parse_fasta(sys.stdin), start=1)
        for index, (original_id, sequence) in numbered_records:
            renamed = "%s_prot%s" % (seq_set_id, index)
            new_to_original[renamed] = original_id
            binf.write_fasta_seq(fasta_out, renamed, sequence)

    # The mapping is written only after the whole input has been consumed.
    with open(mapping_filename, "w") as mapping_out:
        json.dump(new_to_original, mapping_out)
def main():
    """Give every stdin FASTA record a generated name and log the renaming.

    Reads three positional arguments from ``sys.argv``: the sequence-set ID,
    the output FASTA path, and the output JSON mapping path. Records parsed
    by ``binf.parse_fasta(sys.stdin)`` are rewritten to the output FASTA file
    under names of the form ``<seq_set_id>_prot<N>`` (N starts at 1), and a
    dict of new name -> original identifier is dumped as JSON.
    """
    seq_set_id = sys.argv[1]
    munged_fasta_filename = sys.argv[2]
    mapping_filename = sys.argv[3]

    renames = {}

    with open(munged_fasta_filename, 'w') as out_fasta:
        records = binf.parse_fasta(sys.stdin)
        for serial, (old_name, residues) in enumerate(records, start=1):
            generated = '%s_prot%s' % (seq_set_id, serial)
            renames[generated] = old_name
            binf.write_fasta_seq(out_fasta, generated, residues)

    # Persist the lookup table once all records have been renamed.
    with open(mapping_filename, 'w') as out_mapping:
        json.dump(renames, out_mapping)