Example #1
0
def extract(infile, sample):
    ''' Print one TSV row of pair-level tags per alignment pair on stdout. '''

    log = pct.create_logger()

    if not sample:
        # Default the sample label to the input source.
        sample = 'stdin' if infile == '-' else infile

    with pct.open(infile) as f:

        sys.stdout.write(
            'sample\torientation\tinteraction_type\tditag_length\t'
            'insert_size\tfragment_seperation\n')

        for line in f:
            if line.startswith('@'):
                continue  # skip SAM header lines
            try:
                read1 = pct.Sam(line)
                read2 = pct.Sam(next(f))  # consume the mate record
            except StopIteration:
                log.exception('Odd number of alignments in file')
                sys.exit(1)
            # Pair-level tags are mirrored on both mates; read1 suffices.
            tags = read1.optional
            sys.stdout.write(
                f'{sample}\t{tags["or:Z"]}\t'
                f'{tags["it:Z"]}\t{tags["dt:i"]}\t'
                f'{tags["is:i"]}\t{tags["fs:i"]}\n')
Example #2
0
def process(infile, digest):
    ''' Annotate alignment pairs with filter statistics and write SAM records.

    Reads alignments pairwise from infile, computes pair statistics against
    the digest map, stores them as optional SAM tags on both mates and
    writes the records to stdout. Header lines are passed through.
    '''

    log = pct.create_logger()

    # Renamed from 'digest' to avoid shadowing the parameter.
    with pct.open(infile) as in_obj, pct.open(digest) as digest_obj:

        d = process_digest(digest_obj)
        for line in in_obj:
            if line.startswith("@"):
                sys.stdout.write(line)
                continue
            try:
                read1 = pct.Sam(line)
                read2 = pct.Sam(next(in_obj))
            except StopIteration:
                log.exception("Odd number of alignments in file")
                # Fix: previously fell through with read2 undefined,
                # causing a NameError; exit like the sibling commands do.
                sys.exit(1)
            filter_stats = run_filter(read1, read2, d)
            # Pair-level statistics are mirrored onto both mates.
            for tag, key in (('or:Z', 'orientation'),
                             ('it:Z', 'interaction'),
                             ('dt:i', 'ditag_length'),
                             ('is:i', 'insert_size'),
                             ('fs:i', 'fragment_seperation')):
                read1.optional[tag] = filter_stats[key]
                read2.optional[tag] = filter_stats[key]
            # Fragment number differs per mate.
            read1.optional['fn:i'] = filter_stats['read1_fragment']
            read2.optional['fn:i'] = filter_stats['read2_fragment']
            sys.stdout.write(read1.get_record())
            sys.stdout.write(read2.get_record())
Example #3
0
def find_cut_sites(ref_seq, ref, restriction):
    ''' Write tab-separated fragment intervals for a reference sequence.

    Returns 0 on success, 1 when the sequence is empty or invalid.
    '''

    log = pct.create_logger()

    if not ref_seq:
        log.error(f'Reference {ref} contains no sequence.')
        return 1
    if invalid_seq(ref_seq):
        log.error(f'Invalid FASTA character in {ref}.')
        return 1

    overhang = restriction.index('^')
    site = restriction.replace('^', '')
    fragment_number = 0
    previous_end = 0
    for match in re.finditer(site, ref_seq):
        # Skip if restriction sequence at start of reference.
        if match.start() == 0:
            continue
        fragment_number += 1
        fragment_start = 1 if fragment_number == 1 else previous_end + 1
        fragment_end = match.start() + overhang
        sys.stdout.write(
            f'{ref}\t{fragment_start}\t{fragment_end}\t{fragment_number}\n')
        previous_end = fragment_end
    # Final fragment runs from the last cut to the end of the reference.
    sys.stdout.write(
        f'{ref}\t{previous_end + 1}\t{len(ref_seq)}\t{fragment_number + 1}\n')
    return 0
Example #4
0
def duplicates(in_fastq: str = '-',
               out_fastq: str = '-',
               buffer_size: float = 0.1,
               parallel: int = 1):
    ''' Check FASTQ duplication level.

    Pipes the sequence lines (every 4th line, starting at line 2) through
    awk and sort, then counts how many distinct sequences occur at each
    duplication level and writes "level<TAB>count" rows to out_fastq.
    '''

    log = pct.create_logger()

    with contextlib.ExitStack() as cm:
        out_fobj = cm.enter_context(pct.open_smart(out_fastq, mode='w'))
        in_fobj = cm.enter_context(pct.open_smart(in_fastq))
        try:
            # Extract sequence lines only (line 2 of each 4-line record).
            p1 = cm.enter_context(
                subprocess.Popen(['awk', 'NR%4 == 2'],
                                 stdin=in_fobj,
                                 stdout=subprocess.PIPE))
            # Sort so identical sequences become adjacent; buffer_size is
            # a memory fraction passed to sort as a percentage.
            p2 = cm.enter_context(
                subprocess.Popen([
                    'sort', '--parallel', f'{parallel}', '--buffer-size',
                    f'{int(buffer_size*100)}%'
                ],
                                 stdin=p1.stdout,
                                 stdout=subprocess.PIPE,
                                 encoding='utf8'))
            p1.stdout.close()
        except FileNotFoundError as e:
            log.exception(f'{e}\nPlease install awk and sort.')
            sys.exit(1)

        duplicates = defaultdict(int)

        copies = 0  # stays 0 if the input is empty; guards the final flush
        previous_line = None
        for index, line in enumerate(p2.stdout):
            line = line.rstrip('\n')
            if index == 0:
                copies = 1
            elif line == previous_line:
                copies += 1
            else:
                duplicates[copies] += 1
                copies = 1
            previous_line = line
        if copies:
            # Flush the final run of identical lines. Previously this ran
            # unconditionally and raised NameError on empty input.
            duplicates[copies] += 1

        for duplicate_level, ncopies in sorted(duplicates.items()):
            out_fobj.write(f'{duplicate_level}\t{ncopies}\n')

        exit_codes = [p.wait() for p in [p1, p2]]
        log.debug(f'Exit_codes for p1, p2: {exit_codes}.')
        # Fix: use '==' not 'is' — integer identity is implementation-defined.
        if not all(ec == 0 for ec in exit_codes):
            log.error('A sub-process returned a non-zero exit code.')
Example #5
0
def is_valid(read1, read2):
    ''' Return True when read1/read2 form a consistent, correctly
    flagged alignment pair; otherwise log the problem and return False. '''

    log = pct.create_logger()

    problem = None
    if read1.qname != read2.qname:
        problem = (f'Qname mismatch: {read1.qname} {read2.qname}. '
                   'Is file name sorted?')
    elif not read1.is_paired and not read2.is_paired:
        problem = f'{read1.qname} is not paired'
    elif read1.is_read1 == read2.is_read1:
        problem = f'R1 and R2 flags in {read1.qname} not correctly set'
    elif read1.pnext != read2.left_pos or read2.pnext != read1.left_pos:
        problem = f'Mate position mismatch in {read1.qname}.'

    if problem is None:
        return True
    log.error(problem)
    return False
Example #6
0
def truncate(infile, qc, sample, restriction):
    ''' Truncate FASTQ reads at the ligation junction and report QC stats. '''

    log = pct.create_logger()

    ligation_seq, restriction_seq = process_restriction(restriction)
    total = 0
    truncated = 0
    truncated_length = 0

    if not sample:
        sample = infile

    with pct.open(infile) as in_obj:

        is_truncated = False
        for index, line in enumerate(in_obj):
            line = line.rstrip('\n')
            record_position = index % 4
            if record_position == 1:  # sequence line
                total += 1
                line = line.upper()
                if ligation_seq in line:
                    # Cut at the ligation junction, restoring the
                    # original restriction site.
                    line = line[:line.index(ligation_seq)] + restriction_seq
                    is_truncated = True
                seq_length = len(line)
            elif record_position == 3:  # quality line
                # Trim quality to match the (possibly truncated) sequence.
                line = line[:seq_length]
                if is_truncated:
                    truncated += 1
                    truncated_length += seq_length
                    is_truncated = False
            sys.stdout.write(f'{line}\n')

        try:
            mean_truncated_length = truncated_length / truncated
        except ZeroDivisionError:
            mean_truncated_length = 'na'

        with pct.open(qc, stderr=True, mode='w') as qc_out:
            qc_out.write(
                f'{sample}\tTotal\t{total}\n'
                f'{sample}\tTruncated\t{truncated}\n'
                f'{sample}\tNot truncated\t{total-truncated}\n'
                f'{sample}\tMean truncated length\t{mean_truncated_length}\n')
Example #7
0
def digest(infile, restriction):
    ''' Digest each FASTA reference in infile at the restriction site.

    Accumulates sequence lines per reference header and hands each
    completed reference to find_cut_sites.
    '''

    log = pct.create_logger()

    with pct.open(infile) as in_obj:

        header = 1
        ref = None
        seqs = []
        for index, line in enumerate(in_obj):
            if line.startswith('>'):
                if header > 1:
                    # Flush the previous reference before starting a new one.
                    find_cut_sites(''.join(seqs), ref, restriction)
                ref = line.rsplit()[0][1:]
                log.info(f'Digesting reference {ref}.')
                header += 1
                seqs = []
            elif header == 1 and index == 0:
                log.error('FASTA line 1 does not begin with ">".')
                sys.exit(1)
            else:
                seqs.append(line.upper().strip('\n'))
        if header > 1:
            # Flush the final reference. Fix: previously ran unconditionally
            # and raised NameError on empty input (seqs/ref undefined).
            find_cut_sites(''.join(seqs), ref, restriction)
Example #8
0
    Parameters
    ----------
    in_file : str, optional
        Path to input file.
    out_file : str, optional
        Path to output file.
    nsamples : int, optional
        Number of records to sample from input.
    blocksize : int, optional
        Number of consecutive lines per record.
    seed : float, optional
        Seed for random number generator

    """

    log = pct.create_logger()
    log.info(
        f'Sampling {nsamples} records in blocks of {blocksize} from {in_file}')
    random.seed(seed)

    with contextlib.ExitStack() as cm:
        out_fobj = cm.enter_context(pct.open_smart(out_file, mode='w'))
        in_fobj = cm.enter_context(pct.open_smart(in_file))
        log.info(f'Writing output to {out_file}')
        sample = reservoir_sample(in_fobj, nsamples, blocksize)
        for line in sample:
            out_fobj.write(f'{line}\n')


def reservoir_sample(in_fobj: typing.TextIO,
                     nsamples: int = 100_000,
Example #9
0
def filter(infile, qc, sample, min_inward, min_outward, min_ditag, max_ditag):
    ''' Filter alignment pairs by ditag length and insert size.

    Retained pairs are written to stdout; per-category filter counts are
    written to the QC file. At least one filter setting must be given.
    '''

    log = pct.create_logger()
    inputs = [min_inward, min_outward, max_ditag, min_ditag]
    if all(i is None for i in inputs):
        log.error('No filter settings defined.')
        sys.exit(1)

    if not sample:
        if infile == '-':
            sample = 'stdin'
        else:
            sample = infile

    with pct.open(infile) as in_obj:
        total = 0
        retained = 0
        invalid = 0  # NOTE(review): never incremented here — confirm intent
        above_ditag = 0
        below_ditag = 0
        same_fragment = 0
        below_min_inward = 0
        below_min_outward = 0

        for line in in_obj:
            if line.startswith("@"):
                sys.stdout.write(line)
            else:
                try:
                    read1 = pct.Sam(line)
                    read2 = pct.Sam(next(in_obj))
                    total += 1
                except StopIteration:
                    log.exception('Odd number of alignments in file.')
                    sys.exit(1)
                if max_ditag is not None:
                    if read1.optional['dt:i'] > max_ditag:
                        above_ditag += 1
                        continue
                if min_ditag is not None:
                    if read1.optional['dt:i'] < min_ditag:
                        below_ditag += 1
                        continue
                # Insert-size filters only apply to cis interactions.
                if read1.optional['it:Z'] == "cis":
                    if read1.optional['fs:i'] == 0:
                        same_fragment += 1
                        continue
                    if read1.optional['or:Z'] == 'Inward':
                        if min_inward is not None:
                            if read1.optional['is:i'] < min_inward:
                                below_min_inward += 1
                                continue
                    elif read1.optional['or:Z'] == 'Outward':
                        if min_outward is not None:
                            if read1.optional['is:i'] < min_outward:
                                below_min_outward += 1
                                continue
                retained += 1
                sys.stdout.write(read1.get_record())
                sys.stdout.write(read2.get_record())

        with pct.open(qc, stderr=True, mode='w') as qc_out:
            # Fix: the '<'/'>' labels were swapped against their counters —
            # above_ditag counts ditags > max_ditag, below_ditag < min_ditag.
            qc_out.write(
                f'{sample}\tTotal\t{total}\n'
                f'{sample}\tRetained\t{retained}\n'
                f'{sample}\tFiltered\t{total - retained}\n'
                f'{sample}\tInvalid\t{invalid}\n'
                f'{sample}\tDitag < {min_ditag}bp\t{below_ditag}\n'
                f'{sample}\tDitag > {max_ditag}bp\t{above_ditag}\n'
                f'{sample}\tSame fragment\t{same_fragment}\n'
                f'{sample}\tInward insert < {min_inward}bp\t{below_min_inward}\n'
                f'{sample}\tOutward insert < {min_outward}bp\t{below_min_outward}\n'
            )