예제 #1
0
파일: search_uniq.py 프로젝트: Nanguage/UBW
def main(input, blastndb, output, probe_length, search_step, evalue,
         blastn_tmpdir, threads):
    """
    Search uniquely mapped probes (sub-sequences)
    within a series of sequences stored in a fasta file.

    \b
    For example:
    select 30 candidate probe regions with length 500bp, firstly,
    ```
    $ python uniformly_spaced.py data/hg19.fa ./candidate.fa chr1:89000000-90000000 -n 30 -l 500
    ```
    then select uniquely mapped probes (sub-sequences) from it.
    ```
    $ python search_uniq.py candidate.fa example/blastn_db/hg19 probe.fa
    ```

    \b
    Args
    ----
    input : str
        Path to input fasta file.
    blastndb : str
        Path to blastn database.
        build with `makeblastdb` command.
    output : str
        Path to output fasta file.
    probe_length, search_step, evalue, blastn_tmpdir, threads
        Forwarded unchanged to `search_passed_probes`.

    """
    with open(input) as fasta_handle:
        candidates = FastaIO.FastaIterator(fasta_handle)
        unique_probes = search_passed_probes(
            candidates, blastndb, evalue,
            probe_length, search_step, blastn_tmpdir,
            threads)
        # Saved while the input handle is still open: the fasta iterator
        # reads from it on demand, and the probes may be produced lazily
        # as well (NOTE(review): confirm against search_passed_probes).
        save_fasta(unique_probes, output)
예제 #2
0
def load_data(k, stride, pos_fasta, neg_fasta):
    """Build a k-mer count matrix and binary labels from two fasta files.

    Every sequence of `pos_fasta` is read first, then every sequence of
    `neg_fasta`. A sequence is skipped when it contains the vocabulary's
    unknown character or when `Vocabulary.kmer_count` raises
    `AssertionError` for it.

    Args:
        k: k-mer size handed to `Vocabulary(k=k)`.
        stride: step handed to `Vocabulary.kmer_count`.
        pos_fasta: path to the fasta file of positive examples (label 1).
        neg_fasta: path to the fasta file of negative examples (label 0).

    Returns:
        A tuple `(X, y)`: `X` is the `np.vstack` of the per-sequence k-mer
        counts, `y` is a 1-D float array with one label per row of `X`
        (ones for the positive rows followed by zeros for the negative ones).

    Raises:
        ValueError: if no sequence from either file survives the filtering.
    """
    vocab = Vocabulary(k=k)

    rows = []
    labels = []
    # The label travels with the file instead of being recovered by comparing
    # paths: with `fasta == pos_fasta`, passing the same path for both
    # arguments would count every sequence as positive.
    for label, fasta in ((1.0, pos_fasta), (0.0, neg_fasta)):
        with open(fasta) as f:
            for record in tqdm(FastaIO.FastaIterator(f)):
                seq = str(record.seq)
                if vocab.unknow_char in seq:
                    continue
                try:
                    counts = vocab.kmer_count(seq, stride)
                except AssertionError:
                    # kmer_count rejected this sequence; skip it
                    # (presumably too short for k/stride — TODO confirm).
                    continue
                rows.append(counts)
                labels.append(label)

    if not rows:
        # np.vstack([]) would raise a ValueError anyway; say why instead.
        raise ValueError(
            'No usable sequences found in {!r} or {!r}.'.format(
                pos_fasta, neg_fasta))

    X = np.vstack(rows)
    y = np.asarray(labels, dtype=float)
    return X, y
예제 #3
0
def input_text_to_df(input_text):
    """Parse fasta text into a DataFrame.

    The result has one row per fasta record and two columns:
    `sequence_name` (the record's name) and `sequence` (the record's
    sequence converted to `str`).
    """
    with io.StringIO(initial_value=input_text) as handle:
        # Materialise the rows before the in-memory handle is closed.
        rows = [(record.name, str(record.seq))
                for record in FastaIO.FastaIterator(handle)]

    return pd.DataFrame(rows, columns=['sequence_name', 'sequence'])
예제 #4
0
def _assert_fasta_parsable(input_text):
    """Raise ValueError if no fasta record can be read from `input_text`.

    Only the first record is requested, so the whole input is not parsed.
    The error message includes the first three lines of the input.
    """
    with io.StringIO(initial_value=input_text) as handle:
        nothing_parsed = object()

        # Per the original author's note, a malformed fasta yields an empty
        # iterator rather than an error, so emptiness is the failure signal.
        first_record = next(FastaIO.FastaIterator(handle), nothing_parsed)
        if first_record is not nothing_parsed:
            return

        preview = input_text.splitlines()[:3]
        raise ValueError(
            ('Failed to parse any input from fasta file. '
             'Consider checking the formatting of your fasta file. '
             'First bit of contents from the fasta file was\n'
             '{}').format(preview))
예제 #5
0
파일: tr2gene.py 프로젝트: Acribbs/scflow
    help="An ensembl cdna fasta file with the properly formatted header")

parser.add_argument("--output",
                    default=None,
                    type=str,
                    help="output tr2gene file")

args = parser.parse_args()

L.info("args:")
print(args)

# Compiled once, outside the record loop. Raw strings keep `\S` a regex
# class instead of an invalid string escape.
GENE_ID_RE = re.compile(r'gene:(\S+)')
GENE_SYMBOL_RE = re.compile(r'gene_symbol:(\S+)')

# Write one "transcript<TAB>gene<TAB>symbol" line per fasta record.
# Both handles are context-managed so the output is closed even if a
# record fails to parse.
with iotools.open_file(args.output, "w") as outf:
    with iotools.open_file(args.fasta, "r") as handle:
        for record in FastaIO.FastaIterator(handle):

            description = record.description
            # The transcript id is the first space-separated token.
            trans = description.split(" ")[0]

            gene_match = GENE_ID_RE.search(description)
            if gene_match is None:
                raise ValueError(
                    "fasta header has no 'gene:' field: %r" % description)
            gene = gene_match.group(1)

            # Without a gene_symbol field, fall back to the gene id.
            # Previously the symbol of the preceding record was silently
            # reused (or a NameError was raised on the first record).
            symbol_match = GENE_SYMBOL_RE.search(description)
            symbol = symbol_match.group(1) if symbol_match else gene

            outf.write("%s\t%s\t%s\n" % (trans, gene, symbol))