Ejemplo n.º 1
0
def create_target_similarity_network_normalised(target_seq, name):
    """Score every unordered pair of target sequences and pickle the result.

    For each pair of entries in ``target_seq`` a local alignment is run via
    ``skbio.alignment.local_pairwise_align_ssw`` with ``blosum50`` as the
    substitution matrix. The raw score is divided by
    ``sqrt(len(seq1)) * sqrt(len(seq2))``.

    Parameters
    ----------
    target_seq : dict
        Mapping from target name to its sequence.
        NOTE(review): sequences are assumed to be non-empty strings; an
        empty sequence would make the normalisation divide by zero.
    name : str
        Prefix of the output file; the list is saved to
        ``'data/' + name + 'target_similarity.pkl'``.

    Returns
    -------
    None
        The list of ``[name_i, name_j, normalised_score]`` entries is written
        with ``utils.save_any_obj_pkl``; nothing is returned.
    """
    import math
    target_names = list(target_seq.keys())
    seq_info = []
    for i in tqdm(range(len(target_names))):
        seq1 = target_seq[target_names[i]]
        for j in range(i + 1, len(target_names)):
            # Read from the ``target_seq`` argument; the previous code
            # referenced the undefined name ``target_seqs`` here.
            seq2 = target_seq[target_names[j]]
            try:
                _, score, _ = skbio.alignment.local_pairwise_align_ssw(
                    Protein(seq1), Protein(seq2), substitution_matrix=blosum50)
            except Exception:
                # Best effort: a pair that cannot be aligned scores 0.
                # ``Exception`` (not a bare except) lets Ctrl-C through.
                score = 0
            new_score = float(score) / (math.sqrt(len(seq1)) *
                                        math.sqrt(len(seq2)))
            seq_info.append([target_names[i], target_names[j], new_score])
    name = 'data/' + name + 'target_similarity.pkl'
    utils.save_any_obj_pkl(seq_info, name)
Ejemplo n.º 2
0
 def test_to_dict_non_empty(self):
     """to_dict() on an MSA keyed by 'id' maps each id to its sequence."""
     first = Protein('PAW', metadata={'id': 42})
     second = Protein('WAP', metadata={'id': -999})
     msa = TabularMSA([first, second], key='id')

     as_dict = msa.to_dict()
     self.assertEqual(as_dict, {42: first, -999: second})
Ejemplo n.º 3
0
    def test_translate_start_with_start_codon(self):
        """Exercise the ``start`` modes on sequences containing start codons."""
        translate = self.sgc.translate

        # Everything before the start codon is trimmed and the start codon is
        # replaced with M. Alternative start codons that follow it must not
        # be replaced with M, and stop codons keep their default handling.
        rna = RNA('CAUUUGCUGAAAUGA')
        trimmed = Protein('MLK*')
        for mode in {'require', 'optional'}:
            self.assertEqual(translate(rna, start=mode), trimmed)

        # 'ignore': plain translation, no trimming and no M replacement
        untrimmed = Protein('HLLK*')
        self.assertEqual(translate(rna, start='ignore'), untrimmed)

        # a lone start codon needs no replacement under any mode
        lone_start = RNA('AUG')
        met = Protein('M')
        for mode in {'require', 'optional', 'ignore'}:
            self.assertEqual(translate(lone_start, start=mode), met)

        # a lone alternative start codon becomes M ...
        alt_start = RNA('CUG')
        as_met = Protein('M')
        for mode in {'require', 'optional'}:
            self.assertEqual(translate(alt_start, start=mode), as_met)

        # ... unless start codons are ignored
        as_leu = Protein('L')
        self.assertEqual(translate(alt_start, start='ignore'), as_leu)
Ejemplo n.º 4
0
def proteinAlign(seq1,
                 seq2,
                 gap_open_penalty,
                 gap_extend_penalty,
                 local=False):
    """Align two protein sequences and summarise the alignment as a dict.

    Both inputs are upper-cased before being wrapped in ``Protein``. The
    alignment is global (with terminal gaps penalised) unless ``local`` is
    true; ``blosum50`` is passed to the aligner in both cases.

    Returns a dict with the two aligned rows as strings (``'aln1'``,
    ``'aln2'``), the alignment ``'score'``, and ``'similarity'``: the
    relative match frequency between the two rows times 100, formatted to
    two decimals and converted back to ``float``.
    """
    upper1 = seq1.upper()
    upper2 = seq2.upper()
    first, second = Protein(upper1), Protein(upper2)

    if not local:
        result = global_pairwise_align(first, second,
                                       gap_open_penalty, gap_extend_penalty,
                                       blosum50,
                                       penalize_terminal_gaps=True)
    else:
        result = local_pairwise_align(first, second,
                                      gap_open_penalty, gap_extend_penalty,
                                      blosum50)
    aln, score, _ = result

    row1 = str(aln[0])
    row2 = str(aln[1])
    percent = aln[0].match_frequency(aln[1], relative=True) * 100
    similarity = float('{:.2f}'.format(percent))

    return {
        'aln1': row1,
        'aln2': row2,
        'score': score,
        'similarity': similarity,
    }
Ejemplo n.º 5
0
    def test_eq(self):
        """Genetic codes built from equivalent inputs compare equal."""
        aa_table = 'AMPM' * 16
        start_table = '--M-' * 16

        plain = GeneticCode(aa_table, start_table)
        # the name should be ignored by comparison
        named = GeneticCode(aa_table, start_table, 'foo')
        # metadata/positional metadata should be ignored when Sequence
        # subclasses are supplied
        from_proteins = GeneticCode(
            Protein(aa_table, metadata={'foo': 'bar'}),
            Protein(start_table, positional_metadata={'foo': range(64)}))
        equivalent = [plain, named, from_proteins]

        # each code equals itself
        for code in equivalent:
            self.assertTrue(code == code)
            self.assertFalse(code != code)

        # permutations (not combinations) so that both a == b and b == a
        # are checked for every pair
        for left, right in itertools.permutations(equivalent, 2):
            self.assertTrue(left == right)
            self.assertFalse(left != right)
Ejemplo n.º 6
0
    def test_translate_trim_to_cds(self):
        """Combine ``start`` and ``stop`` handling to trim to a CDS."""
        translate = self.sgc.translate
        rna = RNA('UAAUUGCCUCAUUAAUAACAAUGA')

        # Locate the first start codon, drop what precedes it, turn the
        # alternative start codon into M, then cut at the first stop codon
        # after the start codon.
        cds = Protein('MPH')
        for mode in {'require', 'optional'}:
            self.assertEqual(translate(rna, start=mode, stop=mode), cds)

        untrimmed = Protein('*LPH**Q*')
        self.assertEqual(translate(rna, start='ignore', stop='ignore'),
                         untrimmed)

        # alternative reading frame disrupts cds:
        #     AAUUGCCUCAUUAAUAACAAUGA
        #     NCLINNN
        start_pattern = 'reading_frame=2.*start=\'require\''
        with six.assertRaisesRegex(self, ValueError, start_pattern):
            translate(rna, reading_frame=2, start='require')

        stop_pattern = 'reading_frame=2.*stop=\'require\''
        with six.assertRaisesRegex(self, ValueError, stop_pattern):
            translate(rna, reading_frame=2, stop='require')

        frame_two = Protein('NCLINNN')
        for mode in {'ignore', 'optional'}:
            translated = translate(rna, reading_frame=2, start=mode,
                                   stop=mode)
            self.assertEqual(translated, frame_two)
Ejemplo n.º 7
0
def calculate_sim(target_protein):
    """Build raw and normalised pairwise local-alignment score matrices.

    Parameters
    ----------
    target_protein
        Object whose ``seq`` attribute supports ``.tolist()`` and yields the
        protein sequences. NOTE(review): presumably a pandas DataFrame with
        a ``seq`` column -- confirm against the caller.

    Returns
    -------
    sim_matrix : numpy.ndarray
        ``sim_matrix[i, j]`` is the score (element 1 of the result) of
        ``local_pairwise_align_protein`` for sequences ``i`` and ``j``.
    sim_value : numpy.ndarray
        ``(S[i, j] + S[j, i]) / (S[i, i] + S[j, j])`` for every pair, where
        ``S`` is ``sim_matrix``.

    Notes
    -----
    The per-iteration debug prints of the alignment result and of the full
    matrices have been removed; they emitted the whole n-by-n matrix on every
    inner-loop step. The start-up message is kept.
    """
    protein_list = target_protein.seq.tolist()
    protein_num = len(protein_list)
    print(f'==Start== with protein : {protein_num}')

    # Wrap each sequence once instead of once per pairing.
    proteins = [Protein(seq) for seq in protein_list]

    sim_matrix = np.zeros(shape=[protein_num, protein_num])
    for i, first in enumerate(proteins):
        for j, second in enumerate(proteins):
            protein_similarity = local_pairwise_align_protein(
                seq1=first,
                seq2=second,
            )
            sim_matrix[i, j] = protein_similarity[1]

    # Broadcast form of the element-wise normalisation:
    # numerator[i, j] = S[i, j] + S[j, i]; denominator[i, j] = S[i, i] + S[j, j]
    self_scores = np.diag(sim_matrix)
    sim_value = (sim_matrix + sim_matrix.T) / (self_scores[:, None] +
                                               self_scores[None, :])

    return sim_matrix, sim_value
Ejemplo n.º 8
0
    def test_translate_varied_genetic_codes(self):
        """Spot-check translation with NCBI tables 2 and 22 and a custom code."""
        rna = RNA('AAUGAUGUGACUAUCAGAAGG')
        from_ncbi = GeneticCode.from_ncbi
        start_required = 'reading_frame=1.*start=\'require\''

        # table_id=2
        self.assertEqual(from_ncbi(2).translate(rna), Protein('NDVTI**'))
        self.assertEqual(
            from_ncbi(2).translate(rna, start='require', stop='require'),
            Protein('MTI'))

        # table_id=22
        self.assertEqual(from_ncbi(22).translate(rna), Protein('NDVTIRR'))
        with six.assertRaisesRegex(self, ValueError, start_required):
            from_ncbi(22).translate(rna, start='require', stop='require')

        # custom, no start codons
        no_starts = GeneticCode('MWN*' * 16, '-' * 64)
        self.assertEqual(no_starts.translate(rna), Protein('MM*MWN*'))
        with six.assertRaisesRegex(self, ValueError, start_required):
            no_starts.translate(rna, start='require', stop='require')
Ejemplo n.º 9
0
    def test_constructor_not_monomorphic(self):
        """Mixing element types in the constructor raises TypeError."""
        # two different sequence classes
        dna_rna_pattern = 'mixed types.*RNA.*DNA'
        with six.assertRaisesRegex(self, TypeError, dna_rna_pattern):
            TabularMSA([DNA(''), RNA('')])

        # a non-sequence object among sequences
        float_protein_pattern = 'mixed types.*float.*Protein'
        with six.assertRaisesRegex(self, TypeError, float_protein_pattern):
            TabularMSA([Protein(''), Protein(''), 42.0, Protein('')])
Ejemplo n.º 10
0
 def test_all_gappy(self):
     """del_gappy_cols is expected to raise for this alignment."""
     rows = [Protein(text) for text in ('---', '---', 'ALR', 'ELR')]
     aln = TabularMSA(rows)
     with pytest.raises(Exception):
         msa_fun.del_gappy_cols(aln, gap_threshold=0.5)
Ejemplo n.º 11
0
    def test_translate_ncbi_table_id(self):
        """translate() with no argument and with the positional argument 9."""
        nucleotide_seqs = (RNA('AAAUUUAUGCAU'), DNA('AAATTTATGCAT'))
        for nucleotides in nucleotide_seqs:
            # default
            default_translation = nucleotides.translate()
            self.assertEqual(default_translation, Protein('KFMH'))

            # 9 passed positionally
            table_9_translation = nucleotides.translate(9)
            self.assertEqual(table_9_translation, Protein('NFMH'))
Ejemplo n.º 12
0
 def test_translate_six_frames_preserves_metadata(self):
     """The first two frames keep metadata but drop positional metadata."""
     rna = RNA('AUG',
               metadata={'foo': 'bar', 'baz': 42},
               positional_metadata={'foo': range(3)})
     frames = self.sgc.translate_six_frames(rna)
     first_two = list(frames)[:2]

     # metadata retained, positional metadata dropped
     expected = [
         Protein('M', metadata={'foo': 'bar', 'baz': 42}),
         Protein('', metadata={'foo': 'bar', 'baz': 42}),
     ]
     self.assertEqual(first_two, expected)
Ejemplo n.º 13
0
    def test_global_pairwise_align_protein_penalize_terminal_gaps(self):
        """Global protein alignment with ``penalize_terminal_gaps=True``."""
        query = Protein("HEAGAWGHEE")
        target = Protein("PAWHEAE")
        result = global_pairwise_align_protein(
            query, target,
            gap_open_penalty=10.,
            gap_extend_penalty=5.,
            penalize_terminal_gaps=True)
        obs_msa, obs_score, obs_start_end = result

        expected_msa = TabularMSA([Protein("HEAGAWGHEE"),
                                   Protein("---PAWHEAE")])
        self.assertEqual(obs_msa, expected_msa)
        self.assertEqual(obs_score, 1.0)
        self.assertEqual(obs_start_end, [(0, 9), (0, 6)])
Ejemplo n.º 14
0
 def test_no_gappy_2(self):
     """With gap_threshold=0.5 no column of this alignment is removed."""
     rows = [Protein(text) for text in ('-LV', 'A-L', 'AL-', 'ELR')]
     aln = TabularMSA(rows)

     result = msa_fun.del_gappy_cols(aln, gap_threshold=0.5)
     out_aln, gappy_idxs = result

     # Output should be identical to input
     assert out_aln == aln
     assert len(gappy_idxs) == 0
Ejemplo n.º 15
0
    def test_alphabet(self):
        """Protein.alphabet is a set that cannot be altered through access."""
        full_alphabet = set("ABCDEFGHIJKLMNOPQRSTUVWXYZ" "-.*")
        self.assertIs(type(Protein.alphabet), set)
        self.assertEqual(Protein.alphabet, full_alphabet)

        # adding to the returned set must not change later lookups
        returned = Protein.alphabet
        returned.add("&")
        self.assertEqual(Protein.alphabet, full_alphabet)
        self.assertEqual(Protein('').alphabet, full_alphabet)

        # assignment on an instance is rejected
        with self.assertRaises(AttributeError):
            Protein('').alphabet = set("ABCD")
Ejemplo n.º 16
0
 def test_same_as_using_StripedSmithWaterman_object_Protein(self):
     """Compare local_pairwise_align_ssw with a StripedSmithWaterman call."""
     query_text, target_text = 'HEAGAWGHEE', 'PAWHEAE'

     # alignment obtained from the StripedSmithWaterman object
     ssw = StripedSmithWaterman(query_text,
                                protein=True,
                                substitution_matrix=blosum50)
     via_object = ssw(target_text)

     # alignment obtained from the function wrapper
     via_function = local_pairwise_align_ssw(
         Protein(query_text),
         Protein(target_text),
         substitution_matrix=blosum50)

     self._check_TabularMSA_to_AlignmentStructure(via_function, via_object,
                                                  Protein)
Ejemplo n.º 17
0
 def test_init_varied_equivalent_input(self):
     """str, Protein and Sequence inputs build the same GeneticCode."""
     all_m = 'M' * 64
     all_dash = '-' * 64
     equivalent_inputs = (
         (all_m, all_dash),
         (Protein(all_m), Protein(all_dash)),
         (Sequence(all_m), Sequence(all_dash)),
     )
     for amino_acids, starts in equivalent_inputs:
         gc = GeneticCode(amino_acids, starts)
         self.assertEqual(gc.name, '')
         self.assertEqual(gc._amino_acids, Protein('M' * 64))
         self.assertEqual(gc._starts, Protein('-' * 64))
         expected_codon = np.asarray([0, 0, 0], dtype=np.uint8)
         npt.assert_array_equal(gc._m_character_codon, expected_codon)
         self.assertEqual(len(gc._start_codons), 0)
Ejemplo n.º 18
0
    def test_stop_chars(self):
        """Protein.stop_chars is a set that cannot be altered through access."""
        only_star = set('*')
        self.assertIs(type(Protein.stop_chars), set)
        self.assertEqual(Protein.stop_chars, only_star)

        # adding to the returned set must not change later lookups
        returned = Protein.stop_chars
        returned.add("JO")
        self.assertEqual(Protein.stop_chars, only_star)
        self.assertEqual(Protein('').stop_chars, only_star)

        # assignment on an instance is rejected
        with self.assertRaises(AttributeError):
            Protein('').stop_chars = set("^&")
Ejemplo n.º 19
0
    def test_motif_n_glycosylation(self):
        """find_motifs("N-glycosylation") with no hit, a hit, and gaps."""
        motif = "N-glycosylation"

        # no match expected
        seq = Protein("ACDFFACGNPSL")
        hits = list(seq.find_motifs(motif))
        self.assertEqual(hits, [])

        # one match expected
        seq = Protein("ACDFNFTACGNPSL")
        hits = list(seq.find_motifs(motif))
        self.assertEqual(hits, [slice(4, 8)])

        # gap positions are passed through ``ignore``
        seq = Protein("AC-DFN-FTACGNPSL")
        hits = list(seq.find_motifs(motif, ignore=seq.gaps()))
        self.assertEqual(hits, [slice(5, 10)])
Ejemplo n.º 20
0
def align(seq1, seq2, go, ge):
    ''' Locally align two protein sequences with scikit-bio and return the score.

    The sequences come from ``read_seq(seq1, seq2)`` and are wrapped in
    ``Protein`` with ``lowercase=True``. ``go`` and ``ge`` are forwarded as
    the gap open and gap extend penalties; ``substitution_matrix=None`` is
    passed, so no explicit score matrix is supplied here. The score is
    printed and returned. '''
    first_seq, second_seq = read_seq(seq1, seq2)
    query = Protein(first_seq, lowercase=True)
    target = Protein(second_seq, lowercase=True)
    _, score, _ = local_pairwise_align_protein(
        query,
        target,
        gap_open_penalty=go,
        gap_extend_penalty=ge,
        substitution_matrix=None)
    print("\nScore:", score)
    return score
Ejemplo n.º 21
0
 def test_translate_six_frames_preserves_metadata(self):
     """RNA and DNA keep metadata, drop positional metadata, per frame."""
     md = {'foo': 'bar', 'baz': 42}
     pos_md = {'foo': range(3)}
     inputs = (
         RNA('AUG', metadata=md, positional_metadata=pos_md),
         DNA('ATG', metadata=md, positional_metadata=pos_md),
     )
     for nucleotides in inputs:
         first_two = list(nucleotides.translate_six_frames())[:2]
         # metadata retained, positional metadata dropped
         expected = [
             Protein('M', metadata={'foo': 'bar', 'baz': 42}),
             Protein('', metadata={'foo': 'bar', 'baz': 42}),
         ]
         self.assertEqual(first_two, expected)
Ejemplo n.º 22
0
 def test_genbank_to_protein(self):
     """Compare _genbank_to_protein on the multi-record file with fixture 0."""
     index = 0
     record = self.multi[index]
     # seq_num is passed as index + 1
     obs = _genbank_to_protein(self.multi_fp, seq_num=index + 1)
     exp = Protein(record[0],
                   metadata=record[1],
                   lowercase=True,
                   positional_metadata=record[2])
     self.assertEqual(exp, obs)
Ejemplo n.º 23
0
    def test_process_1(self):
        """Check all four outputs of preprocess.process on a 3-row MSA."""
        alignment = TabularMSA(
            [Protein(row) for row in ('AL-', 'VL-', 'MLA')])
        threshold = 0.5

        expected_numeric = [[AA_TABLE[residue]] for residue in ('A', 'V', 'M')]
        expected_binary = [
            [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        ]

        result = preprocess.process(alignment, threshold, AA_TABLE)
        num_mtx, bin_mtx, gappy_idxs, constant_idxs = result

        assert np.array_equal(expected_numeric, num_mtx)
        assert np.array_equal(expected_binary, bin_mtx)
        assert gappy_idxs == [2]
        assert constant_idxs == [1]
Ejemplo n.º 24
0
    def test_translate_six_frames_passes_parameters_through(self):
        """Arguments reach translation however they are passed."""
        for nucleotides in (RNA('UUUAUGUGGUGA'), DNA('TTTATGTGGTGA')):
            # mix of args and kwargs
            frames = nucleotides.translate_six_frames(11, start='require',
                                                      stop='require')
            self.assertEqual(next(frames), Protein('MW'))

            # kwargs only
            frames = nucleotides.translate_six_frames(genetic_code=11,
                                                      start='require',
                                                      stop='require')
            self.assertEqual(next(frames), Protein('MW'))

            # args only
            frames = nucleotides.translate_six_frames(11, 'require',
                                                      'require')
            self.assertEqual(next(frames), Protein('MW'))
Ejemplo n.º 25
0
 def test_translate_reading_frame_non_empty_translation(self):
     """Each of the six reading frames yields its expected translation."""
     seq = RNA('AUGGUGGAA')  # rc = UUCCACCAU
     frames = (1, 2, 3, -1, -2, -3)
     translations = ('MVE', 'WW', 'GG', 'FHH', 'ST', 'PP')
     for frame, text in zip(frames, translations):
         exp = Protein(text)
         obs = self.sgc.translate(seq, reading_frame=frame)
         self.assertEqual(obs, exp)
Ejemplo n.º 26
0
 def test_translate_preserves_metadata(self):
     """translate() keeps metadata and drops positional metadata."""
     rna = RNA('AUG',
               metadata={'foo': 'bar', 'baz': 42},
               positional_metadata={'foo': range(3)})
     obs = self.sgc.translate(rna)

     # metadata retained, positional metadata dropped
     exp = Protein('M', metadata={'foo': 'bar', 'baz': 42})
     self.assertEqual(obs, exp)
Ejemplo n.º 27
0
 def test_sam_to_protein(self):
     """Compare _sam_to_protein output with the single-record fixture."""
     self.maxDiff = None
     obs = _sam_to_protein(self.single_fp)
     exp = Protein(self.single_exp[0], self.single_exp[1])

     # metadata is compared as sorted item lists
     obs_items = sorted(obs.metadata.items())
     exp_items = sorted(exp.metadata.items())
     self.assertEqual(obs_items, exp_items)

     # sequences are compared via their string form
     self.assertEqual(str(obs), str(exp))
Ejemplo n.º 28
0
    def test_process_2(self):
        """Check preprocess.process with the last two columns swapped."""
        # Invert columns 1 and 2 with respect to the previous example
        alignment = TabularMSA(
            [Protein(row) for row in ('A-L', 'V-L', 'MAL')])
        threshold = 0.5

        expected_numeric = [[AA_TABLE[residue]] for residue in ('A', 'V', 'M')]
        expected_binary = [
            [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        ]

        result = preprocess.process(alignment, threshold, AA_TABLE)
        num_mtx, bin_mtx, gappy_idxs, constant_idxs = result

        assert np.array_equal(expected_numeric, num_mtx)
        assert np.array_equal(expected_binary, bin_mtx)
        assert gappy_idxs == [1]
        assert constant_idxs == [1]
Ejemplo n.º 29
0
 def test_degenerate_map(self):
     """degenerate_map is the same on an instance and on the class."""
     exp = {
         'B': set('DN'),
         'Z': set('EQ'),
         'X': set('ACDEFGHIKLMNPQRSTVWY'),
     }
     on_instance = Protein("").degenerate_map
     self.assertEqual(on_instance, exp)
     on_class = Protein.degenerate_map
     self.assertEqual(on_class, exp)
Ejemplo n.º 30
0
    def test_metadata_setter_invalid_type(self):
        """Assigning a non-dict to ``metadata`` raises and changes nothing."""
        msa = TabularMSA([Protein('PAW')], metadata={123: 456})

        invalid_values = (None, 0, 'a', ('f', 'o', 'o'), np.array([]),
                          pd.DataFrame())
        message = 'metadata must be a dict'
        for bad_value in invalid_values:
            with six.assertRaisesRegex(self, TypeError, message):
                msa.metadata = bad_value
            # the rejected assignment leaves the existing metadata in place
            self.assertEqual(msa.metadata, {123: 456})
Ejemplo n.º 31
0
    def test_no_protein_support(self):
        """Expect EMBLFormatError when reading a protein EMBL record."""
        # TODO: add protein support
        expected_message = (r"There's no protein support for EMBL "
                            "record")

        # a fake protein line.
        handle = io.StringIO('ID   M14399; SV 1; linear; mRNA; STD; '
                             'PRO; 63 AA.\n//\n')

        # read a protein record
        with self.assertRaisesRegex(EMBLFormatError, expected_message):
            Protein.read(handle)

        # return to 0
        handle.seek(0)

        # read a generic record
        with self.assertRaisesRegex(EMBLFormatError, expected_message):
            skbio.io.read(handle, format='embl')
Ejemplo n.º 32
0
    def test_motif_n_glycosylation(self):
        """Search for the "N-glycosylation" motif in three sequences."""
        motif_name = "N-glycosylation"

        without_motif = Protein("ACDFFACGNPSL")
        found = list(without_motif.find_motifs(motif_name))
        self.assertEqual(found, [])

        with_motif = Protein("ACDFNFTACGNPSL")
        found = list(with_motif.find_motifs(motif_name))
        self.assertEqual(found, [slice(4, 8)])

        # gapped sequence: the gap positions are given as ``ignore``
        gapped = Protein("AC-DFN-FTACGNPSL")
        found = list(gapped.find_motifs(motif_name, ignore=gapped.gaps()))
        self.assertEqual(found, [slice(5, 10)])
Ejemplo n.º 33
0
def mask_sequence(hhsuite_fp, fullsequence_fp, subsequences_fp=None,
                  min_prob=None, max_pvalue=None, max_evalue=None,
                  min_fragment_length=0):
    """ Splits a protein sequence according to HHsuite results.

    The returned sub-sequences will seamlessly build the full sequence if
    re-concatenated.

    Parameters
    ----------
    hhsuite_fp : str
        Filepath to HHblits/HHsearch output.
    fullsequence_fp : str
        Filepath to the protein sequence of the original query.
    subsequences_fp : str
        Filepath to which sub-sequences are written as a multiple fasta file.
        Each sequence makes up one header and one sequence line, i.e.
        sequences are not wrapped.
        Two files will be produced, suffixed by '.match' and '.non_match'. The
        first holds sub-sequences of hits, the second holds the sub-sequences
        not covered by any hit.
        Default: None, i.e. no file is written.
    min_prob: float
        Minimal probability of a hit to be included in the resulting list.
        Note: probabilities are in the range of 100.0 to 0.0.
        Default: None, i.e. no filtering on probability.
    max_pvalue: float
        Maximal P-value of a hit to be included in the resulting list.
        Default: None, i.e. no filtering on P-value.
    max_evalue: float
        Maximal E-value of a hit to be included in the resulting list.
        Default: None, i.e. no filtering on E-value.
    min_fragment_length: int
        Minimal fragment length of a hit to be included in the resulting list.
        Default: 0, i.e. no filtering on fragment length.

    Returns
    -------
    dict
        ``{'match': [(str, str), ...], 'non_match': [(str, str), ...]}``.
        In each tuple the first component is a fasta header and the second is
        its fasta sequence. Both lists are sorted by start position.

    Raises
    ------
    IOError
        If the file cannot be written.

    Notes
    -----
    A hit must satisfy ALL filtering options (min_prob, max_pvalue, max_evalue,
    min_fragment_length) to be included in the resulting list.
    """

    # parse hits from file
    hits = parse_pdb_match(hhsuite_fp)

    # filter hits
    if min_prob is not None:
        hits = [hit for hit in hits if hit['Probab'] >= min_prob]
    if max_pvalue is not None:
        hits = [hit for hit in hits if hit['P-value'] <= max_pvalue]
    if max_evalue is not None:
        hits = [hit for hit in hits if hit['E-value'] <= max_evalue]
    if min_fragment_length is not None:
        hits = [hit for hit in hits if frag_size(hit) >= min_fragment_length]

    # read the original protein file, used to run HHsearch
    p = Protein.read(fullsequence_fp, seq_num=1)
    query_id = p.metadata['id']
    query_desc = p.metadata['description']

    # each entry is (header, sequence, start); start is only kept for sorting
    results = {'match': [], 'non_match': []}
    # select non overlapping positive hits
    subseqs_pos = select_hits(hits, e_value_threshold=999999)

    for hit in subseqs_pos:
        _id = get_q_id(hit)
        match_id = hit['Hit'].split()[0]
        header = "%s %s %s" % (correct_header_positions(
            query_id,
            hit['alignment'][_id]['start'],
            hit['alignment'][_id]['end']), '# %s' % match_id, query_desc)
        seq = hit['alignment'][_id]['sequence'].replace('-', '')
        results['match'].append((header, seq, hit['alignment'][_id]['start']))

    # collect gaps between positive hits
    subseqs_neg = report_uncovered_subsequences(subseqs_pos, str(p),
                                                min_fragment_length)
    for hit in subseqs_neg:
        header = "%s %s" % (correct_header_positions(
            query_id,
            hit['start'],
            hit['end']), query_desc)
        seq = hit['sequence']
        results['non_match'].append((header, seq, hit['start']))

    # sort by start position
    for type_ in results:
        results[type_] = sorted(results[type_], key=lambda x: x[2])

    # write sub-sequences to a multiple fasta file, sequences are un-wrapped
    if subsequences_fp is not None:
        try:
            for type_ in results:
                # the context manager closes the file even if a write fails
                with open('%s.%s' % (subsequences_fp, type_), 'w') as f:
                    for res in results[type_]:
                        f.write(">%s\n%s\n" % res[:2])
        except IOError:
            raise IOError('Cannot write to file "%s"' % subsequences_fp)

    # removing the start position component from all subsequences
    return {type_: [res[:2] for res in results[type_]]
            for type_ in results}