Beispiel #1
0
def test_read_fasta_protein(protein: Path, protein_gzip: Path):
    """Check that the protein record is read correctly from both fixtures.

    Args:
        protein: Path of the protein FASTA fixture.
        protein_gzip: Path of the second fixture, expected to hold the same
            record (gzip-compressed, judging by the fixture name).
    """
    # The sequence is written here wrapped over several lines; the reader is
    # expected to return it as one string without line breaks.
    expected = """
QIKDLLVSSSTDLDTTLVLVNAIYFKGMWKTAFNAEDTREMPFHVTKQESKPVQMMCMNNSFNVATLPAE
KMKILELPFASGDLSMLVLLPDEVSDLERIEKTINFEKLTEWTNPNTMEKRRVKVYLPQMKIEEKYNLTS
VLMALGMTDLFIPSANLTGISSAESLKISQAVHGAFMELSEDGIEMAGSTGVIEDIKHSPESEQFRADHP
FLFLIKHNPTNTIVYFGRYWSP
    """.replace("\n", "").strip()

    for path in (protein, protein_gzip):
        with read_fasta(path) as file:
            item = file.read_item()
            assert item.defline == "P01013 GENE X PROTEIN (OVALBUMIN-RELATED)"
            assert item.has_desc
            assert item.desc == "GENE X PROTEIN (OVALBUMIN-RELATED)"
            assert item.id == "P01013"
            assert item.sequence == expected

        # Each fixture must contain exactly one record. The record count is
        # verified for the same file that was just read (previously the
        # plain fixture was counted twice and the gzip one never), and the
        # reader is closed by the context manager.
        with read_fasta(path) as file:
            assert len(list(file)) == 1
Beispiel #2
0
def test_write_fasta(tmp_path: Path):
    """Round-trip two records through write_fasta and read_fasta.

    The round trip is done twice: once with a ``.faa`` file name and once
    with a ``.faa.xz`` file name.

    Args:
        tmp_path: Directory in which the output files are created.
    """
    deflines = ["defline1", "defline2 description"]
    sequences = ["ABCD", "ABCD" * 100]

    # Paths are built under tmp_path instead of calling os.chdir(): changing
    # the process-wide working directory would leak into every later test.
    for filename in ("output.faa", "output.faa.xz"):
        filepath = str(tmp_path / filename)

        with write_fasta(filepath) as writer:
            for defline, sequence in zip(deflines, sequences):
                writer.write_item(defline, sequence)

        with read_fasta(filepath) as reader:
            for defline, sequence in zip(deflines, sequences):
                item = reader.read_item()
                assert item.defline == defline
                assert item.sequence == sequence
Beispiel #3
0
def extract_sequences_2(path, out_folder, input_beds):
    """Write one FASTA file per entry of `input_beds` with its interval sequences.

    Args:
        path: Either a directory holding ``<species>_hard_50.fa`` genome
            files, or the path of a single FASTA file whose base name starts
            with ``<species>_``. Example: ``'data/genomes/'``.
        out_folder: Directory that receives one ``<entry>.fa`` file per
            entry of `input_beds`. Example: ``'sdregions8'``.
        input_beds: Directory whose entries are handed to
            ``modify_sequences`` and ``build_dict_2``. Example: ``'same8'``.

    Prints the number of reverse-complemented intervals and the ten largest
    ``(i__, j__)`` pairs returned by ``build_dict_2``.
    """
    # Genome sequences keyed by '<species>#<defline>'.
    main_fa_dict = dict()
    if os.path.isdir(path):
        for name in os.listdir(path):
            parts = name.split('_')
            if len(parts) > 2 and parts[1] == 'hard' and parts[2] == '50.fa':
                specie = parts[0]  # e.g. 'hg19'
                # os.path.join works with or without a trailing separator
                # on `path`; plain concatenation required the trailing '/'.
                genome = os.path.join(path, f'{specie}_hard_50.fa')
                for item in read_fasta(genome):
                    main_fa_dict[f'{specie}#{item.defline}'] = item.sequence
    else:
        specie = path.split('/')[-1].split('_')[0]
        for item in read_fasta(f"{path}"):
            main_fa_dict[f'{specie}#{item.defline}'] = item.sequence

    count_array = []
    num_ov_neg = 0

    for bed_name in os.listdir(input_beds):
        bed_path = f'{input_beds}/{bed_name}'
        modify_sequences(main_fa_dict, bed_path)
        # The context manager closes the output file even when a dictionary
        # lookup or a write below raises.
        with open(f'{out_folder}/{bed_name}.fa', 'w') as new_fa:
            d__, i__, j__ = build_dict_2(bed_path, dict())
            count_array.append((i__, j__))
            for chr_ in d__:
                # The last character of the key is the strand sign; the rest
                # is the key into main_fa_dict.
                genome_key = chr_[:-1]
                for interval in d__[chr_]:
                    if chr_[-1] == '+':
                        new_fa.write(
                            f'>{chr_}-{interval.begin}-{interval.end}\n{main_fa_dict[genome_key][interval.begin : interval.end]}\n'
                        )
                    else:
                        num_ov_neg += 1
                        new_fa.write(
                            f'>{chr_}-{interval.begin}-{interval.end}\n{Seq(main_fa_dict[genome_key][interval.begin : interval.end]).reverse_complement()}\n'
                        )

    count_array.sort(reverse=True)
    print(f'Num of rev comp: {num_ov_neg}')
    print('10 biggest clusters: (#of sequences, coverage)')
    for entry in count_array[:10]:
        print(entry)
Beispiel #4
0
def _test_read_fasta_correct(filepath: Path):
    """Read the four-record FASTA file at `filepath` in every supported way."""
    deflines = ["ID1", "ID2", "ID3", "ID4"]
    sequences = ["GAGUUA", "CAUAACAAATT", "AAGAA", "AAGAA"]

    # Record by record, then one read past the end.
    reader = read_fasta(filepath)
    for defline, sequence in zip(deflines, sequences):
        item = reader.read_item()
        assert item.defline == defline
        assert item.id == defline
        assert not item.has_desc
        assert item.sequence == sequence

    with pytest.raises(StopIteration):
        reader.read_item()

    reader.close()

    # Plain iteration over the reader.
    reader = read_fasta(filepath)
    for index, item in enumerate(reader):
        assert item.defline == deflines[index]
        assert item.sequence == sequences[index]
    reader.close()

    # Bulk read of all records.
    reader = read_fasta(filepath)
    items = reader.read_items()
    for index, (defline, sequence) in enumerate(zip(deflines, sequences)):
        assert items[index].defline == defline
        assert items[index].sequence == sequence
    reader.close()

    # Iteration with the reader used as a context manager.
    with read_fasta(filepath) as reader:
        for index, item in enumerate(reader):
            assert item.defline == deflines[index]
            assert item.sequence == sequences[index]

    # An explicit close inside the context manager must not raise.
    with read_fasta(filepath) as reader:
        reader.close()
def get_data(filename: str):
    """Load the packaged test data set named `filename`.

    Reads ``<filename>.fasta.gz`` and ``<filename>.json.gz`` from the
    ``dcp.test`` package resources and decompresses both.

    Returns:
        A dict with three entries: ``"hmm"`` (whatever
        ``dcp.example.get`` returns for ``<filename>.hmm``), ``"targets"``
        (the items read from the FASTA text) and ``"desired"`` (the JSON
        records indexed by ``(multiple_hits, hmmer3_compat, target,
        profile)``).
    """
    raw_fasta = pkg_resources.read_binary(dcp.test, f"{filename}.fasta.gz")
    fasta_text = gzip.decompress(raw_fasta).decode()

    raw_json = pkg_resources.read_binary(dcp.test, f"{filename}.json.gz")
    records = json.loads(gzip.decompress(raw_json))

    # Fields copied from each JSON record into its "desired" entry.
    result_fields = (
        "alt_loglik",
        "alt_path",
        "alt_codon_stream",
        "alt_amino_stream",
        "null_loglik",
        "null_path",
        "null_codon_stream",
        "null_amino_stream",
    )

    desired = {}
    for record in records:
        key = (
            record["multiple_hits"],
            record["hmmer3_compat"],
            record["target"],
            record["profile"],
        )
        desired[key] = {field: record[field] for field in result_fields}

    hmm = dcp.example.get(f"{filename}.hmm")
    targets = fr.read_fasta(StringIO(fasta_text)).read_items()
    return {"hmm": hmm, "targets": targets, "desired": desired}
Beispiel #6
0
def extract_sequences(path, out_folder, input_beds):
    """Write the interval sequences of every ``<species>_hard_50.fa`` genome.

    For each matching genome file found in `path`, the intervals returned by
    ``build_dict`` for ``<input_beds>/<species>_<species>/aligned/`` are
    sliced out of the genome and written to
    ``<out_folder>/SD_regions_<species>.fa``.

    Args:
        path: Directory with the genome files, e.g. ``'data/genomes/'``.
        out_folder: Directory for the output files, e.g. ``'sdregions8'``.
        input_beds: Directory passed on to ``build_dict``, e.g. ``'same8'``.
    """
    for name in os.listdir(path):
        parts = name.split('_')
        if not (len(parts) > 2 and parts[1] == 'hard' and parts[2] == '50.fa'):
            continue

        specie = parts[0]  # e.g. 'hg19'
        print(specie)

        # Genome sequences keyed by '<species>#<defline>'.
        main_fa_dict = dict()

        # The context manager closes the output file even when reading the
        # genome or a dictionary lookup raises.
        with open(f'{out_folder}/SD_regions_{specie}.fa', 'w') as new_fa:
            # os.path.join works with or without a trailing separator on
            # `path`; plain concatenation required the trailing '/'.
            genome = os.path.join(path, f'{specie}_hard_50.fa')
            for item in read_fasta(genome):
                main_fa_dict[f'{specie}#{item.defline}'] = item.sequence
            print(len(main_fa_dict))

            d__, _, _ = build_dict(
                f'{input_beds}/{specie}_{specie}/aligned/', dict())

            # A separate name is used here so the directory-entry loop
            # variable is not overwritten.
            for key in d__:
                for interval in d__[key]:
                    new_fa.write(
                        f'>{key}-{interval.begin}-{interval.end}\n{main_fa_dict[key][interval.begin : interval.end]}\n'
                    )
def open_read_frames(fasta_file, format_print = False):
    """Return the distinct candidate proteins of the first record in `fasta_file`.

    The first sequence returned by ``read_fasta`` and its reverse complement
    are both scanned for stretches that begin with the start codon, continue
    in whole triplets and are followed by a stop codon. Each stretch is
    translated with ``translate_dna_to_prot``.

    Args:
        fasta_file: Passed unchanged to ``read_fasta``.
        format_print: When true, print the DNA candidates of each strand,
            then the proteins and their lengths.

    Returns:
        List of distinct proteins in order of first appearance (forward
        strand first).
    """
    dna_strands = read_fasta(fasta_file)
    dna_strand = next(iter(dna_strands.values()))

    # Take into account the reverse complement
    dna_strand_rev = reverse_complement(dna_strand)

    stop_codons = [k for k, v in dna_codon.items() if v == 'Stop']
    # (M)ethionine is considered to be the start codon
    start_codon = [k for k, v in dna_codon.items() if v == 'M'][0]

    # Elements between Methionine and stop codons can only be triplets
    pattern = f'(({start_codon})(...)*?)({"|".join(stop_codons)})'

    # Keys of a dict keep insertion order, which gives a reproducible result
    # order; the previous list(set(...)) round trip returned the proteins in
    # arbitrary order.
    unique_prot = {}
    for strand in (dna_strand, dna_strand_rev):
        # NOTE(review): `overlapped` is not a keyword of the standard
        # library's re.findall; presumably `re` is the third-party `regex`
        # module imported under that name -- confirm.
        candidate_dna = [g[0] for g in re.findall(pattern, strand, overlapped = True)]
        if format_print:
            print(candidate_dna)
        for cand in candidate_dna:
            unique_prot[translate_dna_to_prot(cand)] = None

    all_cand_prot = list(unique_prot)

    if format_print:
        print('\n'.join(all_cand_prot))
        for cand in all_cand_prot:
            print(len(cand))

    return all_cand_prot
def consensus_profile(fasta_file, format_print=False):
    """Compute the per-position base counts and the consensus string.

    Args:
        fasta_file: Passed unchanged to ``read_fasta``; its values are the
            DNA strings. The profile is as wide as the first string.
        format_print: When true, print the consensus followed by one
            ``<base>: <counts>`` line per base.

    Returns:
        Tuple ``(profile_dict, consensus)`` where ``profile_dict`` maps each
        of ``'A'``, ``'C'``, ``'G'``, ``'T'`` to its list of counts per
        position and ``consensus`` holds the most frequent base of every
        position.

    Raises:
        IndexError: If there are no strings, or a string is longer than the
            first one.
        KeyError: If a string contains a character other than A, C, G, T.
    """
    dna_strands = list(read_fasta(fasta_file).values())

    bases = 'ACGT'
    width = len(dna_strands[0])
    profile_dict = {base: [0] * width for base in bases}

    for strand in dna_strands:
        for position, nucleotide in enumerate(strand):
            profile_dict[nucleotide][position] += 1

    # Rows follow the order of `bases`, columns are positions. argmax along
    # axis 0 returns the first maximal row, so ties are resolved in
    # A, C, G, T order. A plain array replaces the discouraged np.matrix.
    counts = np.array([profile_dict[base] for base in bases])
    consensus = ''.join(bases[row] for row in counts.argmax(axis=0))

    if format_print:
        print(consensus)
        for base in bases:
            print(base + ': ' +
                  ' '.join(str(count) for count in profile_dict[base]))

    return profile_dict, consensus
def _load_fasta(rna_fasta):
    """Return the frozen set of ids gathered from every record of `rna_fasta`.

    Each record is handed to ``_load_fasta_rec`` together with the set being
    filled. Raises ``LrgaspException`` when the set is still empty afterwards.
    """
    collected = set()
    with read_fasta(rna_fasta) as reader:
        for record in reader:
            _load_fasta_rec(collected, record)
    if not collected:
        raise LrgaspException("no RNA sequences in FASTA")
    return frozenset(collected)
Beispiel #10
0
def correct_cdr(cdr_stream, seqs_stream, threshold, out_stream):
    """Correct every CDR cluster and write the resulting clusters out.

    Args:
        cdr_stream: Input handed to ``fr.read_cluster``.
        seqs_stream: Input handed to ``fr.read_fasta``.
        threshold: Passed unchanged to ``correct_cluster``.
        out_stream: Passed unchanged to ``out_cluster``.

    Output clusters are named ``Cluster_<n>`` with ``n`` counting from 0
    across all input clusters.
    """
    full_seqs = fr.read_fasta(seqs_stream)
    clusters = fr.read_cluster(cdr_stream)
    logger.info("Cdr correction started, {0} cdr clusters".format(len(clusters)))
    cl_id = 0
    # Only the values are used, and .values() exists on both Python 2 and
    # Python 3 mappings, whereas .iteritems() is Python 2 only.
    for c_seqs in clusters.values():
        for newc_seqs in correct_cluster(c_seqs, full_seqs, threshold).values():
            out_cluster("Cluster_{0}".format(cl_id), newc_seqs, full_seqs, out_stream)
            cl_id += 1
    logger.info("Cdr correction finished, {0} cdr clusters".format(cl_id))
Beispiel #11
0
def test_read_fasta_damaged(damaged1, damaged2, damaged3):
    """Check the line number reported for each damaged FASTA fixture."""
    # (fixture, records that must parse before the failure, failing line)
    cases = (
        (damaged1, 0, 1),
        (damaged2, 0, 2),
        (damaged3, 1, 4),
    )

    for source, good_records, bad_line in cases:
        with read_fasta(source) as f:
            for _ in range(good_records):
                f.read_item()
            with pytest.raises(ParsingError) as excinfo:
                f.read_item()
            e: ParsingError = excinfo.value
            assert e.line_number == bad_line
def remove_dups(stream_in, stream_out):
    """Collapse identical sequences of `stream_in` and write them to `stream_out`.

    Every distinct sequence is written once under the name
    ``Seq<index>_<count>``, where ``count`` is how many input records
    carried that sequence.
    """
    seqs = fr.read_fasta(stream_in)
    logger.info("Removing duplicates from {0} sequences".format(len(seqs)))

    # How often each distinct sequence occurs.
    occurrences = {}
    for sequence in seqs.values():
        occurrences[sequence] = occurrences.get(sequence, 0) + 1

    new_seqs = {}
    for index, (sequence, count) in enumerate(occurrences.items()):
        new_seqs["Seq{0}_{1}".format(index, count)] = sequence

    fr.write_fasta(new_seqs, stream_out)
    logger.info("Done, {0} sequences left".format(len(new_seqs)))
Beispiel #13
0
def rna_splice(fasta_file):
    """Remove the introns from the first record, translate it, print and return it.

    The first value returned by ``read_fasta`` is the DNA string; every
    further value is an intron whose occurrences are deleted from it. The
    result is transcribed with ``transcribe_dna_to_rna`` and translated with
    ``translate_rna_into_protein``.
    """
    records = iter(read_fasta(fasta_file).values())

    # First record: the DNA string. Remaining records: the introns.
    spliced = next(records)
    for intron in records:
        spliced = spliced.replace(intron, '')

    protein = translate_rna_into_protein(transcribe_dna_to_rna(spliced))

    print(protein)
    return protein
Beispiel #14
0
def muscle(fasta_dict):
    """Run the external MUSCLE program on `fasta_dict` and return its output.

    Args:
        fasta_dict: Sequences written with ``fr.write_fasta`` to the
            program's standard input.

    Returns:
        Whatever ``fr.read_fasta`` returns for the program's standard output.
    """
    logger.debug("Running muscle for {0} seqs".format(len(fasta_dict)))

    in_file = tempfile.SpooledTemporaryFile()
    out_file = tempfile.SpooledTemporaryFile()
    try:
        fr.write_fasta(fasta_dict, in_file)
        in_file.flush()
        in_file.seek(0)

        cmdline = [MUSCLE_EXEC, "-diags", "-maxiters", "2", "-quiet"]
        proc = subprocess.Popen(cmdline, stdin=in_file, stdout=out_file)
        returncode = proc.wait()
    finally:
        # The input is not needed once the process has ended (or failed to
        # start); close it instead of leaving it to the garbage collector.
        in_file.close()

    if returncode != 0:
        # Kept non-fatal so callers see the same control flow as before;
        # the failure is at least visible in the log now.
        logger.warning("Muscle exited with status {0}".format(returncode))

    out_file.flush()
    out_file.seek(0)

    logger.debug("Muscle finished")
    # out_file is deliberately left open: the value returned by
    # fr.read_fasta may still read from it.
    return fr.read_fasta(out_file)
Beispiel #15
0
def overlap_graphs(fasta_file, k, format_print = False):
    """List the ordered pairs of records whose ends overlap by `k` characters.

    A pair ``(a, b)`` of distinct record names is reported when the last
    `k` characters of record ``a`` equal the first `k` characters of
    record ``b``.

    Args:
        fasta_file: Passed unchanged to ``read_fasta``.
        k: Length of the overlap.
        format_print: When true, print each pair as ``"<a> <b>"``.

    Returns:
        List of ``(a, b)`` name tuples.
    """
    strands = read_fasta(fasta_file)

    edges = []
    for tail_name, tail_seq in strands.items():
        for head_name, head_seq in strands.items():
            # A record is never paired with itself.
            if head_name == tail_name:
                continue
            if tail_seq[-k:] != head_seq[:k]:
                continue

            if format_print:
                print(tail_name + ' ' + head_name)
            edges.append((tail_name, head_name))

    return edges
    
def locate_restriction_sites(fasta_file, format_print=False):
    """Find the windows of the first record that equal their mirrored counterpart.

    For every start position the window lengths 12 down to 4 are tried and
    the first (longest) match is reported.

    Args:
        fasta_file: Passed unchanged to ``read_fasta``; only the first value
            is used.
        format_print: When true, print each hit as it is found.

    Returns:
        List of ``"<1-based position> <length>"`` strings.
    """
    records = read_fasta(fasta_file)
    strand = next(iter(records.values()))
    mirrored_strand = reverse_complement(strand)

    hits = []
    for start in range(len(strand)):
        # The reverse palindromes can be between 4 and 12 chars
        for size in range(12, 3, -1):
            window = strand[start:start + size]
            if len(window) != size:
                # Window runs past the end of the strand.
                continue

            # Walk `mirrored_strand` backwards from the position mirroring
            # `start`, then flip the result back. Assuming
            # reverse_complement returns a string as long as its input,
            # this is the reverse complement of `window`.
            counterpart = mirrored_strand[-(start + 1):-(start + size + 1):-1][::-1]
            if window != counterpart:
                continue

            hit = f'{start+1} {size}'
            hits.append(hit)
            if format_print:
                print(hit)
            break

    return hits
Beispiel #17
0
def find_cdr3(in_stream, start_seqs, end_seqs, out_stream):
    """Locate the best-scoring CDR region of every sequence and write it out.

    A candidate region runs from the beginning of a start-query alignment up
    to the position before the beginning of an end-query alignment. It is
    accepted when the distance between the two beginnings lies in
    ``[MIN_CDR_LEN, MAX_CDR_LEN]`` and the start lies in the second half of
    the sequence. Its score is the sum of both alignment scores.

    Args:
        in_stream: Input handed to ``fr.read_fasta``.
        start_seqs: Query sequences aligned to find region starts.
        end_seqs: Query sequences aligned to find region ends.
        out_stream: Writable object; receives ``>name`` and region lines for
            every sequence with at least one candidate.
    """
    MIN_CDR_LEN = 30
    MAX_CDR_LEN = 90
    MATCH = 2
    MISSMATCH = -2
    INDEL = -1

    logger.info("Finding cdr regions started")
    seqs = fr.read_fasta(in_stream)
    start_align = defaultdict(list)
    end_align = defaultdict(list)
    # .items() exists on both Python 2 and Python 3 mappings, whereas
    # .iteritems() is Python 2 only.
    for queries, aligned in ((start_seqs, start_align), (end_seqs, end_align)):
        for qry in queries:
            for h, alns in ext_tools.xalign(seqs, qry, MATCH, MISSMATCH, INDEL).items():
                aligned[h].extend(alns)

    for h, seq in seqs.items():
        candidates = []
        for start in start_align[h]:
            for end in end_align[h]:
                dist = end.begin - start.begin
                if (MIN_CDR_LEN <= dist <= MAX_CDR_LEN and
                        start.begin > len(seq) / 2):
                    candidates.append(AlignInfo(start.begin, end.begin - 1, start.score + end.score))

        if not candidates:
            logger.debug("{0} : no cdr found".format(h))
            continue

        # max() returns the first best candidate, like the former manual
        # scan, but also works when every score is zero or negative (the
        # scan then left the winner unset and crashed on `cand.begin`).
        cand = max(candidates, key=lambda c: c.score)
        max_score = cand.score

        cdr = seq[cand.begin : cand.end + 1]
        out_stream.write(">{0}\n{1}\n".format(h, cdr))
        logger.debug("{0} : cdr found with score {1}".format(h, max_score))
    logger.info("Finding cdr regions finished")
def find_longest_shared_motif(fasta_file, format_print = False):
    """Return the longest substring shared by every record of `fasta_file`.

    Candidate substrings are taken from the shortest record, longest first,
    as produced by ``ordered_combinations``; the first one that
    ``check_for_pattern`` finds in all other records is returned.

    Args:
        fasta_file: Passed unchanged to ``read_fasta``.
        format_print: Accepted for signature compatibility; not used.

    Returns:
        The shared substring, or the string ``'No shared motif found'``.
    """
    dna_strings = list(read_fasta(fasta_file).values())

    # Only substrings of the shortest record can occur in all records.
    shortest_string = min(dna_strings, key = len)
    dna_strings.remove(shortest_string)

    # Takes about 3m, could improve by checking each substring wo combs list
    # The range ends at length 1 inclusive: a single shared character is
    # still a shared motif (the previous bound of 1 stopped at length 2).
    for n_chars in range(len(shortest_string), 0, -1):
        for substring in ordered_combinations(shortest_string, n_chars):
            if check_for_pattern(dna_strings, substring):
                return substring

    return 'No shared motif found'
Beispiel #19
0
def main():
    """Build a graph from an MSA FASTA file and write it as GFA.

    All options are read from the module-level ``args`` object:
    ``log_file``, ``in_msa``, ``seq_names``, ``compact``, ``out_gfa`` and
    ``nodes_dict``.

    Exits with status 1 when no command-line arguments were given or when no
    input MSA file was provided.
    """
    if len(sys.argv) < 2:
        print("You need to provide inputs. try -h or --help for help")
        # Non-zero status so calling scripts can detect the failure.
        sys.exit(1)

    # Log file stuff
    log_file = args.log_file or "out_log.log"

    logging.basicConfig(filename=log_file, filemode='w',
                        format='[%(asctime)s] %(message)s',
                        level=logging.INFO)

    # taking fasta file
    if not args.in_msa:
        logging.error("You did not provide input msa file, -f, check -h for help")
        sys.exit(1)
    sequences = read_fasta(args.in_msa)

    # reading sequence names if provided
    seq_names = dict()
    if args.seq_names:
        if os.path.exists(args.seq_names):
            with open(args.seq_names) as in_file:
                for raw_l in in_file:
                    line = raw_l.strip().split("\t")
                    if line[0] not in sequences:
                        logging.error("The sequence {} in file {} does not exist in the "
                                      "fasta file {} provided".format(line[0], args.seq_names, args.in_msa))
                    elif len(line) < 2:
                        # A known sequence without a second column used to
                        # raise IndexError; report it and keep going.
                        logging.error("The sequence {} in file {} has no name "
                                      "column".format(line[0], args.seq_names))
                    else:
                        seq_names[line[0]] = line[1]
        else:
            logging.error("File {} provided as sequence names tsv does not exist".format(args.seq_names))

    else:
        # Without a names file every sequence is named after itself.
        for key in sequences.keys():
            seq_names[key] = key

    # building graph
    logging.info("constructing graph...")
    graph = msa_graph(sequences, seq_names)
    graph.colors = seq_names

    if args.compact:
        logging.info("compacting linear paths in graph...")
        # Compacting just merges stretches of single nodes together
        graph.compact()

    logging.info("sorting the graph toplogocially...")
    # I use topological sorting to write the paths in order
    graph.sort()  # topological sorting
    logging.info("adding paths to graph...")
    graph.add_paths()  # adds paths to graph
    logging.info("writing graph...")

    write_gfa(graph, args.out_gfa)  # outputting
    if args.nodes_dict:
        logging.info("writing nodes info json file...")
        graph.nodes_info(args.nodes_dict)
    logging.info("finished...")
Beispiel #20
0
from fasta_reader import read_fasta


def LCS(text):
    """Return the longest substring of ``text[0]`` contained in every string.

    Candidates are tried from longest to shortest and, within one length,
    from left to right, so ties go to the leftmost candidate of ``text[0]``.

    Args:
        text: Sequence of strings.

    Returns:
        The shared substring, or ``None`` when `text` is empty or the
        strings have no character in common.
    """
    if not text:
        return None

    reference, others = text[0], text[1:]
    # The range ends at length 1 inclusive: a single shared character is
    # still a common substring (the previous bound of 1 stopped at length 2).
    for k in range(len(reference), 0, -1):
        for start in range(len(reference) - k + 1):
            curr = reference[start:start + k]
            if all(curr in other for other in others):
                return curr
    return None


if __name__ == '__main__':
    import sys

    # The input file may be given as the first command-line argument; the
    # previously hard-coded location remains the default.
    input_path = sys.argv[1] if len(sys.argv) > 1 else r'D:/Downloads/rosalind_lcsm.txt'
    _, sequences = read_fasta(input_path)
    print(LCS(sequences))
Beispiel #21
0
def hmmer_filter(db_filepath):
    """Re-score the reported targets with HMMER and rewrite the output files.

    Reads ``oamino.fasta.bak``, ``ocodon.fasta.bak`` and ``output.gff.bak``
    from the current directory and writes ``oamino.fasta``,
    ``ocodon.fasta`` and ``output.gff``. Only records that received a
    non-NaN score from the search are kept; they are renumbered
    ``item1``, ``item2``, ... in GFF order, and the GFF attributes get
    ``Bias``, ``E-value`` and ``Score`` appended.

    Args:
        db_filepath: Passed to the ``HMMER`` constructor.
    """
    searcher = HMMER(db_filepath)
    searcher.timeout = 60
    if not searcher.is_indexed:
        searcher.index()

    amino_items = fr.read_fasta("oamino.fasta.bak").read_items()
    for number, item in enumerate(amino_items, start=1):
        assert item.id == f"item{number}"
    gff_items = gff_io.read_gff("output.gff.bak").read_items()

    # Group the amino targets by the profile accession of their GFF record.
    # The numeric part of the GFF ID ("item<N>") is the 1-based position of
    # the target in the FASTA file.
    by_profile = {}
    for record in gff_items:
        attributes = record.attributes_asdict()
        profile_acc = attributes["Profile_acc"]
        target_index = int(attributes["ID"][4:]) - 1
        by_profile.setdefault(profile_acc, []).append(amino_items[target_index])

    # Search every group against its own profile and keep the usable scores.
    scores = {}
    for profile_acc, members in by_profile.items():
        fasta_text = "\n".join(
            ">" + member.defline + "\n" + member.sequence for member in members
        )
        result = searcher.search(
            StringIO(fasta_text),
            "/dev/null",
            tblout=True,
            heuristic=True,
            cut_ga=True,
            hmmkey=profile_acc,
            Z=1,
        )
        for row in result.tbl:
            full = row.full_sequence
            e_value, score = full.e_value, full.score
            if score.lower() == "nan":
                continue
            scores[row.target.name] = (e_value, score, full.bias)

    # Rewrite the GFF with the scored records only, renumbering them.
    renamed = {}
    next_number = 1
    with open("output.gff", "w") as gff_out:
        gff_out.write("##gff-version 3\n")
        for record in gff_items:
            old_id = record.attributes_asdict()["ID"]
            hit = scores.get(old_id)
            if hit is None:
                continue
            e_value, score, bias = hit
            new_id = f"item{next_number}"
            renamed[old_id] = new_id
            columns = [
                record.seqid,
                record.source,
                record.type,
                str(record.start),
                str(record.end),
                str(record.score),
                record.strand,
                str(record.phase),
                record.attributes.replace(old_id, new_id)
                + f";Bias={bias};E-value={e_value};Score={score}",
            ]
            gff_out.write("\t".join(columns) + "\n")
            next_number += 1

    def write_renamed(out_path, items):
        # Keep only the items whose defline was renumbered above.
        with fr.write_fasta(out_path) as writer:
            for item in items:
                if item.defline in renamed:
                    writer.write_item(renamed[item.defline], item.sequence)

    write_renamed("oamino.fasta", amino_items)

    codon_items = fr.read_fasta("ocodon.fasta.bak").read_items()
    for number, item in enumerate(codon_items, start=1):
        assert item.id == f"item{number}"

    write_renamed("ocodon.fasta", codon_items)
Beispiel #22
0
from trace_back import traceback

# Script body: align the first sequence of two FASTA files and print a
# summary. Expected arguments: two FASTA paths, then the mismatch score, the
# gap-open penalty and the gap-extension penalty (see the usage text below).
#
# print(...) with a single parenthesised expression behaves the same on
# Python 2 and Python 3, unlike the print statements used before.
if len(sys.argv) != 6:
    print("\n\nERROR: incorrect arguments given to script.\n\nUSAGE: sh SDS_BLAST.sh <sequence_1.fasta> <sequence_2.fasta> mismatch_score gap_open_Penalty gap_extension_penalty\n\nEXAMPLE: sh SDS_BLAST.sh Data/seq_11.fasta Data/seq_22.fasta -20 40 2\n\n")
    sys.exit(1)

seqs1_inputs = str(sys.argv[1])
seqs2_inputs = str(sys.argv[2])
match = 10  # fixed match score; not taken from the command line
mismatch = int(sys.argv[3])
gap_open = int(sys.argv[4])
gap_extend = int(sys.argv[5])

# Cost of starting a gap: opening it plus its first extension.
gap_init_penalty = gap_open + gap_extend

seqs1_headers, seqs1 = read_fasta(seqs1_inputs)
seqs2_headers, seqs2 = read_fasta(seqs2_inputs)

# Only the first record of each file is aligned.
S, D, I, score, start_A, start_B = populate_matrices(seqs1[0], seqs2[0], match,
                                                     mismatch, gap_open,
                                                     gap_extend)
align_1, align_2, end_A, end_B, num_matches, num_mismatches, num_gaps, middle_array = traceback(
    S, D, I, seqs1[0], seqs2[0], gap_init_penalty, start_A, start_B)

print("\n" + "SUCCESSFULLY ALIGNED SEQUENCES:".center(40)
      + "\n\n(A) %s  by  (B) %s \n" % (seqs1_headers[0], seqs2_headers[0]))
print("LENGTH_OF_SEQUENCE_A = %s\nLENGTH_OF_SEQUENCE_B = %s\nLENGTH_OF_ALIGNMENT = %s" % (
    len(seqs1[0]), len(seqs2[0]), str(len(align_1))))
print("START_POS_A = %s\nSTART_POS_B = %s\nEND_POS_A = %s\nEND_POS_B = %s\n" % (
    start_A, start_B, end_A, end_B))
a_1 = []
Beispiel #23
0
from w_code import wcode
from id_seq import seq_id

# Script body: concatenate all input sequences and compute their word codes.
# Expected arguments: a FASTA file, a file whose first line is the word
# model, and the integer `wlcut` (see the usage text below).
#
# print(...) with a single parenthesised expression behaves the same on
# Python 2 and Python 3, unlike the print statements used before.
if len(sys.argv) != 4:
    print("\n\nERROR: incorrect arguments given to script.\n\nUSAGE: python <MSA.py> <sequence.fasta> <word_model.txt> wlcut\n\n")
    sys.exit(1)

with open(sys.argv[2], 'r') as wm:
    word_model = wm.readline().strip()
wlcut = int(sys.argv[3])
w = len(word_model)
if w < 1:
    print("\n\nERROR: word_model must be of legnth >= 1")
    sys.exit(1)

seqs_headers, seqs_inputs = read_fasta(sys.argv[1])

# C is every sequence followed by a '#' terminator, all concatenated.
C = ''.join(seq + '#' for seq in seqs_inputs)
k = len(seqs_inputs)
st = []      # start offset of each sequence inside C
length = []  # length of each sequence
p = -1       # offset of the most recently added '#' terminator
for seq in seqs_inputs:
    st.append(p + 1)
    p += len(seq) + 1
    length.append(len(seq))

word_code = wrd_code(C, word_model)
# list(...) keeps `positions` a real list on Python 3 as well.
positions = list(range(0, len(C)))
sup_wrd = sw_code(positions, word_code, w, wlcut)