コード例 #1
0
ファイル: pear.py プロジェクト: yemilawal/INNUca
def compress_decompress(compressed_file, decompressed_file, compressed_True):
    """Compress or decompress a FASTQ file and pickle the outcome.

    When ``compressed_True`` is set, ``decompressed_file`` is gzipped into
    ``compressed_file``.  Otherwise ``compressed_file`` is decompressed into
    ``decompressed_file`` with the program reported by
    ``utils.compressionType``; if no compression is detected, the input is
    checked as-is and treated as the decompressed file.

    Nothing is returned.  ``[run_successfully, resulting file,
    length_sequence]`` is passed to ``utils.saveVariableToPickle`` together
    with the directory and the extension-less basename of the decompressed
    file.
    """
    run_successfully = False
    malformated_fastq = False
    length_sequence = None

    # The compression type only matters when decompressing.
    compression_type = None if compressed_True else utils.compressionType(compressed_file)

    if compressed_True or compression_type is not None:
        if compressed_True:
            program, mode_flag = 'gzip', ''
            source, target = decompressed_file, compressed_file
        else:
            program, mode_flag = compression_type[0], '--decompress'
            source, target = compressed_file, decompressed_file

        command = [program, mode_flag, '--stdout', '--keep', source, '>', target]
        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, True, None, True)

        # Only a freshly decompressed file is validated.
        if run_successfully and not compressed_True:
            malformated_fastq, length_sequence = check_uncompression_fastq(decompressed_file)
    else:
        # Decompression requested but the input is not compressed:
        # validate it directly and report it as the decompressed file.
        run_successfully = True
        malformated_fastq, length_sequence = check_uncompression_fastq(compressed_file)
        decompressed_file = compressed_file

    if malformated_fastq:
        run_successfully = False

    resulting_file = compressed_file if compressed_True else decompressed_file
    pickle_dir = os.path.dirname(decompressed_file)
    pickle_name = os.path.splitext(os.path.basename(decompressed_file))[0]
    utils.saveVariableToPickle([run_successfully, resulting_file, length_sequence], pickle_dir, pickle_name)
コード例 #2
0
def countSequencedBases(fastq_file, outdir):
    """Count the sequenced bases of a compressed FASTQ file and pickle them.

    The file is streamed through a shell pipeline of ``grep`` filters ending
    in ``wc``.  The pipeline is run twice, once with ``--chars`` and once
    with ``--lines``, and the line count is subtracted from the character
    count.  Nothing is attempted when ``utils.compressionType`` returns
    ``None``.

    ``[run_successfully, bases]`` is saved through
    ``utils.saveVariableToPickle`` in ``outdir`` under
    ``estimate_coverage.<basename of fastq_file>``.
    """
    run_successfully = False
    bases = None

    # Determine compression type
    compression_type = utils.compressionType(fastq_file)
    if compression_type is not None:
        pipeline = [
            compression_type[1], '--keep', '--stdout', fastq_file,
            '|', 'grep', '--after-context=1', '"@"',
            '|', 'grep', '--invert-match', '"^--$"',
            '|', 'grep', '--invert-match', '"@"',
            '|', 'wc',
        ]

        # Number of characters
        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(
            pipeline + ['--chars'], True, None, False)
        if run_successfully:
            bases = int(stdout.splitlines()[0])

            # Number of lines
            run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(
                pipeline + ['--lines'], True, None, False)
            if run_successfully:
                bases -= int(stdout.splitlines()[0])

    pickle_name = str('estimate_coverage.' + os.path.basename(fastq_file))
    utils.saveVariableToPickle([run_successfully, bases], outdir, pickle_name)
コード例 #3
0
ファイル: fastQintegrity.py プロジェクト: INNUENDOWEB/INNUca
def fastQintegrity(fastq, outdir):
    """Check that a compressed FASTQ decompresses and guess its encoding.

    The file is decompressed into a temporary file inside ``outdir`` (named
    after the FASTQ basename without its last extension).  On success the
    temporary file is handed to ``run_guess_encoding_single_thread``.  The
    temporary file is always removed afterwards.

    Nothing is returned.  ``[run_successfully, encoding, min_reads_length,
    max_reads_length]`` is saved through ``utils.saveVariableToPickle`` in
    ``outdir`` under the FASTQ basename.  The last three values stay
    ``None`` when the file is not compressed or decompression fails.
    """
    run_successfully = False

    temporary_output_file = os.path.join(
        outdir,
        os.path.splitext(os.path.basename(fastq))[0])

    compression_type = utils.compressionType(fastq)

    # All three results need a default: they are only assigned on the
    # success path, yet they are always pickled at the end.
    encoding, min_reads_length, max_reads_length = None, None, None

    if compression_type is not None:
        command = [
            compression_type[1], '--stdout', '--keep', fastq, '>',
            temporary_output_file
        ]
        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(
            command, True, None, False)

        if run_successfully:
            encoding, min_reads_length, max_reads_length = run_guess_encoding_single_thread(
                temporary_output_file, None, outdir)

    if os.path.isfile(temporary_output_file):
        os.remove(temporary_output_file)

    utils.saveVariableToPickle(
        [run_successfully, encoding, min_reads_length, max_reads_length],
        outdir, os.path.basename(fastq))
コード例 #4
0
ファイル: assembly_mapping.py プロジェクト: yemilawal/INNUca
def get_sequence_coverage(alignment_file, sequence_to_analyse, outdir,
                          counter):
    """Compute the coverage of one sequence and pickle the result.

    ``compute_genome_coverage_data`` produces a coverage data file which is
    summarised by ``calculate_genome_coverage``.  If no problems are reported
    but the returned position is 0, the position is taken from the sequence
    header instead.  The coverage data file is removed afterwards (errors
    while removing are printed, not raised).

    ``[sequence_to_analyse, run_successfully, problems_found, position,
    coverage]`` is saved through ``utils.saveVariableToPickle`` in ``outdir``
    under ``coverage.sequence_<counter>``.
    """
    position = 0
    coverage = 0
    # Stays True unless the coverage data could be computed and analysed.
    problems_found = True

    run_successfully, genome_coverage_data_file = compute_genome_coverage_data(
        alignment_file, sequence_to_analyse, outdir, counter)

    if run_successfully:
        problems_found, position, coverage = calculate_genome_coverage(
            genome_coverage_data_file)
        needs_header_length = not problems_found and position == 0
        if needs_header_length:
            # Assuming SPAdes headers renamed (with sample name at the beginning)
            header_fields = sequence_to_analyse.rsplit('_', 6)
            position = int(header_fields[4])

    try:
        os.remove(genome_coverage_data_file)
    except Exception as e:
        print(e)

    pickle_name = str('coverage.sequence_' + str(counter))
    utils.saveVariableToPickle(
        [sequence_to_analyse, run_successfully, problems_found, position,
         coverage],
        outdir, pickle_name)
コード例 #5
0
def analyse_sequence_data(bam_file, sequence_information, outdir, counter,
                          reference_file, length_extra_seq,
                          minimum_depth_presence, minimum_depth_call,
                          minimum_depth_frequency_dominant_allele):
    """Analyse variants and coverage of one sequence and pickle the result.

    A VCF file and a coverage file are created for the sequence named by
    ``sequence_information['header']``.  When both steps succeed, the
    variants are checked for multiple alleles and a coverage report is
    computed.

    ``[run_successfully, counter, multiple_alleles_found, percentage_absent,
    percentage_lowCoverage, meanCoverage]`` is saved through
    ``utils.saveVariableToPickle`` in ``outdir`` under
    ``coverage_info.<counter>``.  The four analysis values are ``None`` when
    either step fails.
    """
    multiple_alleles_found = None
    percentage_absent = None
    percentage_lowCoverage = None
    meanCoverage = None

    header = sequence_information['header']

    # Create vcf file (for multiple alleles check)
    run_successfully, gene_vcf = create_vcf(bam_file, header, outdir, counter,
                                            reference_file)

    if run_successfully:
        # Create coverage tab file
        run_successfully, gene_coverage = assembly_mapping.compute_genome_coverage_data(
            bam_file, header, outdir, counter)

    if run_successfully:
        variants = get_variants(gene_vcf)
        coverage = get_coverage(gene_coverage)

        multiple_alleles_found = find_multiple_alleles(
            variants, sequence_information['length'],
            minimum_depth_presence, minimum_depth_call,
            minimum_depth_frequency_dominant_allele, length_extra_seq)

        report = get_coverage_report(
            coverage, sequence_information['length'],
            minimum_depth_presence, minimum_depth_call, length_extra_seq)
        percentage_absent, percentage_lowCoverage, meanCoverage = report

    results = [
        run_successfully, counter, multiple_alleles_found, percentage_absent,
        percentage_lowCoverage, meanCoverage
    ]
    utils.saveVariableToPickle(results, outdir,
                               str('coverage_info.' + str(counter)))
コード例 #6
0
ファイル: download.py プロジェクト: B-UMMI/getSeqENA
def download_with_sra_prefetch(aspera_key, outdir, pickle_prefix, ena_id):
    """Download ``<ena_id>.sra`` with ``prefetch`` and move it into ``outdir``.

    When ``aspera_key`` is given, the path printed by ``which ascp`` is
    combined with the key into the ``-a`` option of ``prefetch``.  After a
    successful download the file is renamed from ``$HOME/ncbi/public/sra``
    into ``outdir``; if the rename raises ``OSError`` the error is printed
    and the file is copied and then removed instead.

    ``run_successfully`` is saved through ``utils.saveVariableToPickle`` in
    ``outdir`` under ``<pickle_prefix>.<ena_id>``.
    """
    aspera_option = ''
    if aspera_key is not None:
        _, which_stdout, _ = utils.runCommandPopenCommunicate(['which', 'ascp'], False, None, False)
        aspera_option = '-a {ascp}|{aspera_key}'.format(ascp=which_stdout.splitlines()[0], aspera_key=aspera_key)

    command = ['prefetch', aspera_option, ena_id]
    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, 3600, True)

    if run_successfully:
        # Let the shell expand $HOME to locate the prefetch output folder.
        _, prefetch_outdir, _ = utils.runCommandPopenCommunicate(['echo', '$HOME/ncbi/public/sra'], True, None, False)

        try:
            os.rename(os.path.join(prefetch_outdir.splitlines()[0], ena_id + '.sra'),
                      os.path.join(outdir, ena_id + '.sra'))
        except OSError as e:
            print('Found the following error:{}'.format(e))

            from shutil import copy as copy_file

            downloaded = os.path.join(prefetch_outdir.splitlines()[0], ena_id + '.sra')
            copy_file(downloaded, os.path.join(outdir, ena_id + '.sra'))
            os.remove(os.path.join(prefetch_outdir.splitlines()[0], ena_id + '.sra'))

    utils.saveVariableToPickle(run_successfully, outdir, pickle_prefix + '.' + ena_id)
コード例 #7
0
ファイル: download.py プロジェクト: B-UMMI/getSeqENA
def gzip_files(file_2_compress, pickle_prefix, outdir):
    """Gzip a file next to itself and delete the original on success.

    A trailing ``.temp`` extension is dropped from the output name before
    ``.gz`` is appended.  ``run_successfully`` is saved through
    ``utils.saveVariableToPickle`` in ``outdir`` under
    ``<pickle_prefix>.<basename of file_2_compress>``.
    """
    is_temporary = file_2_compress.endswith('.temp')
    out_file = os.path.splitext(file_2_compress)[0] if is_temporary else file_2_compress
    gz_target = str(out_file + '.gz')

    command = ['gzip', '--stdout', '--best', file_2_compress, '>', gz_target]
    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, True, None, True)

    # Keep the uncompressed input when compression did not succeed.
    if run_successfully:
        os.remove(file_2_compress)

    pickle_name = str(pickle_prefix + '.' + os.path.basename(file_2_compress))
    utils.saveVariableToPickle(run_successfully, outdir, pickle_name)
コード例 #8
0
ファイル: download.py プロジェクト: B-UMMI/getSeqENA
def download_with_aspera(aspera_file_path, aspera_key, outdir, pickle_prefix, sra, ena_id):
    """Download a file into ``outdir`` with ``ascp`` and pickle the outcome.

    With ``sra`` false, ``aspera_file_path`` is fetched as user ``era-fasp``
    with the extra option ``-P33001``, and the pickle is named after the
    last path component.  With ``sra`` true, the remote path is built from
    ``ena_id`` and the pickle is named after ``ena_id``.

    ``run_successfully`` is saved through ``utils.saveVariableToPickle`` in
    ``outdir`` under ``<pickle_prefix>.<name>``.
    """
    if sra:
        port_option = ''
        # NOTE(review): the '[email protected]' prefix looks like an e-mail
        # obfuscation artefact rather than a real user@host -- confirm the
        # intended remote against the upstream project before relying on it.
        remote = '[email protected]:/sra/sra-instant/reads/ByRun/sra/{a}/{b}/{c}/{c}.sra'.format(
            a=ena_id[:3], b=ena_id[:6], c=ena_id)
        pickle = pickle_prefix + '.' + ena_id
    else:
        port_option = '-P33001'
        remote = str('era-fasp@' + aspera_file_path)
        pickle = pickle_prefix + '.' + aspera_file_path.rsplit('/', 1)[1]

    command = ['ascp', '-QT', '-l', '300m', port_option, '-i', aspera_key, remote, outdir]
    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, 3600, True)

    utils.saveVariableToPickle(run_successfully, outdir, pickle)
コード例 #9
0
ファイル: campyGenomes.py プロジェクト: B-UMMI/campyGenomes
def downloadAndINNUca(outdir, run_ID, asperaKey, threads):
    """Download one run with getSeqENA.py and process it with INNUca.py.

    A temporary one-line ID list is written for ``getSeqENA.py`` and removed
    once that command returns.  If the download succeeds, ``INNUca.py`` is
    run on ``<outdir>/<run_ID>/``; the regular, non-hidden files of its
    ``<run_ID>`` sub-folder are then moved up one level and that sub-folder
    is removed.  Files matching ``.gz``, ``.log`` and ``.cpu.txt`` are
    cleaned up through ``removeFiles``.

    The INNUca success flag is pickled as ``<run_ID>_run_successfully``;
    on success the elapsed time is also pickled as
    ``<run_ID>_downloadAndINNUca_time``.
    """
    start_time = time.time()

    temp_file = os.path.join(outdir, run_ID + '.temp.runID_fileList.txt')
    with open(temp_file, 'wt') as writer:
        writer.write(run_ID + '\n')

    download_command = [
        'getSeqENA.py', '-l', temp_file, '-o', outdir, '-a', asperaKey,
        '--downloadLibrariesType', 'PE'
    ]
    getSeqENA_run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(
        download_command, False, None)

    os.remove(temp_file)

    sample_directory = os.path.join(outdir, run_ID, '')

    innuca_run_successfully = False
    if getSeqENA_run_successfully:
        innuca_command = [
            'INNUca.py', '-i', sample_directory, '-s',
            '"Campylobacter jejuni"', '-g', '1.6', '-o', sample_directory,
            '-j', str(threads), '--jarMaxMemory', 'auto'
        ]
        innuca_run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(
            innuca_command, False, None)

        # Flatten the INNUca output: pull its files up into the sample folder.
        innuca_dir = os.path.join(sample_directory, run_ID, '')
        for entry in os.listdir(innuca_dir):
            if entry.startswith('.'):
                continue
            source = os.path.join(innuca_dir, entry)
            if os.path.isfile(source):
                shutil.move(source, os.path.join(sample_directory, entry))
        utils.removeDirectory(innuca_dir)

    for suffix in ('.gz', '.log', '.cpu.txt'):
        removeFiles(sample_directory, suffix)

    if innuca_run_successfully:
        time_taken = utils.runTime(start_time)
        utils.saveVariableToPickle(time_taken, sample_directory,
                                   run_ID + '_downloadAndINNUca_time')

    utils.saveVariableToPickle(innuca_run_successfully, sample_directory,
                               run_ID + '_run_successfully')
コード例 #10
0
ファイル: download.py プロジェクト: B-UMMI/getSeqENA
def download_with_wget(ftp_file_path, outdir, pickle_prefix, sra, ena_id):
    """Download a file into ``outdir`` with ``wget`` and pickle the outcome.

    With ``sra`` false, ``ftp_file_path`` is downloaded and stored under its
    last path component.  With ``sra`` true, the URL of ``<ena_id>.sra`` is
    built from ``ena_id`` and the file is stored as ``<ena_id>.sra``.

    ``run_successfully`` is saved through ``utils.saveVariableToPickle`` in
    ``outdir`` under ``<pickle_prefix>.<file name or ena_id>``.
    """
    if sra:
        url = 'ftp://ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByRun/sra/{a}/{b}/{c}/{c}.sra'.format(
            a=ena_id[:3], b=ena_id[:6], c=ena_id)
        destination = os.path.join(outdir, ena_id + '.sra')
        pickle = pickle_prefix + '.' + ena_id
    else:
        url = ftp_file_path
        file_download = ftp_file_path.rsplit('/', 1)[1]
        destination = os.path.join(outdir, file_download)
        pickle = pickle_prefix + '.' + file_download

    command = ['wget', '--tries=1', url, '-O', destination]
    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, 3600, True)

    utils.saveVariableToPickle(run_successfully, outdir, pickle)
コード例 #11
0
def guess_encoding(fastq, number_reads_access_None_all, outdir):
    """Guess the quality encoding of a FASTQ file and pickle the result.

    Every fourth line is treated as a quality string.  Its length is
    recorded and its range (from ``get_qual_range``) widens the running
    range, which is passed to ``get_encodings_in_range`` whenever it
    changes.

    Parameters:
        fastq: path of the uncompressed FASTQ file to read.
        number_reads_access_None_all: maximum number of reads to inspect,
            or ``None`` to inspect the whole file.
        outdir: directory handed to ``utils.saveVariableToPickle``.

    Nothing is returned.  ``[fastq, valid_encodings, min read length,
    max read length]`` is pickled under ``encoding.<basename without last
    extension>``; both lengths are ``None`` when no read was inspected.
    """
    gmin, gmax = 99, 0
    valid_encodings = None
    reads_length = []

    # Plain text mode already uses universal newlines on Python 3; the
    # old 'U' flag is rejected by open() from Python 3.11 onwards.
    with open(fastq, 'rt') as reader:
        for line_number, line in enumerate(reader, 1):
            if line_number % 4 != 0:
                continue

            # Stop reading once the requested number of reads is exceeded,
            # rather than scanning the rest of the file for nothing.
            if number_reads_access_None_all is not None and line_number // 4 > number_reads_access_None_all:
                break

            quality = line.splitlines()[0]
            reads_length.append(len(quality))
            lmin, lmax = get_qual_range(quality)
            if lmin < gmin or lmax > gmax:
                gmin, gmax = min(lmin, gmin), max(lmax, gmax)
                valid_encodings = get_encodings_in_range(gmin, gmax)

    min_length = min(reads_length) if reads_length else None
    max_length = max(reads_length) if reads_length else None
    pickle_name = 'encoding' + '.' + os.path.splitext(os.path.basename(fastq))[0]
    utils.saveVariableToPickle([fastq, valid_encodings, min_length, max_length], outdir, pickle_name)
コード例 #12
0
def downloadWithSRAprefetch(asperaKey, outdir, pickle_prefix, ena_id):
    """Download ``<ena_id>.sra`` with ``prefetch`` and move it into ``outdir``.

    When ``asperaKey`` is given, the path printed by ``which ascp`` is
    combined with the key into the ``-a`` option of ``prefetch``.  After a
    successful download the file is moved from ``$HOME/ncbi/public/sra``
    into ``outdir``.

    Nothing is returned.  ``run_successfully`` is saved through
    ``utils.saveVariableToPickle`` in ``outdir`` under
    ``<pickle_prefix>.<ena_id>``.
    """
    # Imported locally so the function does not depend on a module-level
    # import that may not exist in the file.
    import shutil

    command = ['prefetch', '', ena_id]

    if asperaKey is not None:
        _, ascp, _ = utils.runCommandPopenCommunicate(
            ['which', 'ascp'], False, None, False)
        command[1] = '-a {ascp}|{asperaKey}'.format(ascp=ascp.splitlines()[0],
                                                    asperaKey=asperaKey)

    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(
        command, False, 3600, True)
    if run_successfully:
        # Let the shell expand $HOME to locate the prefetch output folder.
        _, prefetch_outdir, _ = utils.runCommandPopenCommunicate(
            ['echo', '$HOME/ncbi/public/sra'], True, None, False)

        # shutil.move falls back to copy-and-delete when source and
        # destination are on different filesystems, where os.rename fails.
        shutil.move(
            os.path.join(prefetch_outdir.splitlines()[0], ena_id + '.sra'),
            os.path.join(outdir, ena_id + '.sra'))

    utils.saveVariableToPickle(run_successfully, outdir,
                               pickle_prefix + '.' + ena_id)
コード例 #13
0
ファイル: fastQintegrity.py プロジェクト: INNUENDOCON/INNUca
def fastQintegrity(fastq, outdir):
    """Check that a compressed FASTQ decompresses and guess its encoding.

    The file is decompressed into a temporary file inside ``outdir``; on
    success that file is handed to ``run_guess_encoding_single_thread``.
    The temporary file is removed afterwards if it exists.

    ``[run_successfully, encoding, min_reads_length, max_reads_length]`` is
    saved through ``utils.saveVariableToPickle`` in ``outdir`` under the
    FASTQ basename.  The last three values stay ``None`` when the file is
    not compressed or decompression fails.
    """
    run_successfully = False
    encoding = min_reads_length = max_reads_length = None

    fastq_name = os.path.basename(fastq)
    temporary_output_file = os.path.join(outdir, os.path.splitext(fastq_name)[0])

    compression_type = utils.compressionType(fastq)
    if compression_type is not None:
        decompress = [compression_type[1], '--stdout', '--keep', fastq, '>', temporary_output_file]
        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(decompress, True, None, False)

        if run_successfully:
            guessed = run_guess_encoding_single_thread(temporary_output_file, None, outdir)
            encoding, min_reads_length, max_reads_length = guessed

    if os.path.isfile(temporary_output_file):
        os.remove(temporary_output_file)

    results = [run_successfully, encoding, min_reads_length, max_reads_length]
    utils.saveVariableToPickle(results, outdir, fastq_name)
コード例 #14
0
def get_sequence_coverage(alignment_file, sequence_to_analyse, outdir, counter):
    """Compute the coverage of one sequence and pickle the result.

    ``compute_genome_coverage_data`` produces a coverage data file which is
    summarised by ``calculate_genome_coverage``.  If no problems are reported
    but the returned position is 0, the position is taken from the sequence
    header instead.  The coverage data file is removed afterwards; a failure
    to remove it is printed rather than raised.

    Nothing is returned.  ``[sequence_to_analyse, run_successfully,
    problems_found, position, coverage]`` is saved through
    ``utils.saveVariableToPickle`` in ``outdir`` under
    ``coverage.sequence_<counter>``.
    """
    problems_found = False
    position = 0
    coverage = 0

    run_successfully, genome_coverage_data_file = compute_genome_coverage_data(alignment_file, sequence_to_analyse, outdir, counter)
    if run_successfully:
        problems_found, position, coverage = calculate_genome_coverage(genome_coverage_data_file)
        if not problems_found and position == 0:
            # Assuming SPAdes headers renamed (with sample name at the beginning)
            position = int(sequence_to_analyse.rsplit('_', 6)[4])
    else:
        problems_found = True

    # Best-effort cleanup: report the failure but still pickle the result.
    try:
        os.remove(genome_coverage_data_file)
    except Exception as e:
        # Function-call form is valid on both Python 2 and Python 3.
        print(e)

    utils.saveVariableToPickle([sequence_to_analyse, run_successfully, problems_found, position, coverage], outdir, str('coverage.sequence_' + str(counter)))
コード例 #15
0
def analyse_sequence_data(bam_file, sequence_information, outdir, counter,
                          reference_file, length_extra_seq,
                          minimum_depth_presence, minimum_depth_call,
                          minimum_depth_frequency_dominant_allele):
    """Build a consensus and coverage report for one sequence and pickle it.

    A VCF file and a coverage file are created for the sequence named by
    ``sequence_information['header']``.  When both steps succeed, a sample
    consensus sequence is created from the variants and a coverage report
    is computed.

    Nothing is returned.  ``[run_successfully, counter,
    number_multi_alleles, percentage_absent, percentage_lowCoverage,
    meanCoverage, consensus_sequence, number_diferences]`` is saved through
    ``utils.saveVariableToPickle`` in ``outdir`` under
    ``coverage_info.<counter>``.  When a step fails, the counts are 0 and
    the remaining analysis values are ``None``.
    """
    percentage_absent = None
    percentage_lowCoverage = None
    meanCoverage = None
    number_diferences = 0
    # These two are only assigned on the full-success path but are always
    # pickled below, so they need defaults to avoid a NameError.
    number_multi_alleles = 0
    consensus_sequence = None

    # Create vcf file (for multiple alleles check)
    run_successfully, gene_vcf = create_vcf(bam_file,
                                            sequence_information['header'],
                                            outdir, counter, reference_file)

    if run_successfully:
        # Create coverage tab file
        run_successfully, gene_coverage = compute_genome_coverage_data(
            bam_file, sequence_information['header'], outdir, counter)

        if run_successfully:
            variants = get_variants(gene_vcf)

            coverage = get_coverage(gene_coverage)

            run_successfully, number_multi_alleles, consensus_sequence, number_diferences = create_sample_consensus_sequence(
                outdir, sequence_information['header'], reference_file,
                variants, minimum_depth_presence, minimum_depth_call,
                minimum_depth_frequency_dominant_allele,
                sequence_information['sequence'], length_extra_seq)

            percentage_absent, percentage_lowCoverage, meanCoverage = get_coverage_report(
                coverage, sequence_information['length'],
                minimum_depth_presence, minimum_depth_call, length_extra_seq)

    utils.saveVariableToPickle([
        run_successfully, counter, number_multi_alleles, percentage_absent,
        percentage_lowCoverage, meanCoverage, consensus_sequence,
        number_diferences
    ], outdir, str('coverage_info.' + str(counter)))
コード例 #16
0
def countSequencedBases(fastq_file, outdir):
    """Count the sequenced bases of a compressed FASTQ file and pickle them.

    A shell pipeline of ``grep`` filters ending in ``wc`` is run first with
    ``--chars`` and then with ``--lines``; the base count is the character
    count minus the line count.  The second run only happens when the first
    succeeds, and nothing is run when ``utils.compressionType`` returns
    ``None``.

    ``[run_successfully, bases]`` is saved through
    ``utils.saveVariableToPickle`` in ``outdir`` under
    ``estimate_coverage.<basename of fastq_file>``.
    """
    run_successfully = False
    bases = None

    # Determine compression type
    compression_type = utils.compressionType(fastq_file)
    if compression_type is not None:
        command = [compression_type[1], '--keep', '--stdout', fastq_file, '|', 'grep', '--after-context=1', '"@"', '|', 'grep', '--invert-match', '"^--$"', '|', 'grep', '--invert-match', '"@"', '|', 'wc', '']

        # First pass counts characters, second pass counts lines.
        counts = []
        for wc_option in ('--chars', '--lines'):
            command[18] = wc_option
            run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, True, None, False)
            if not run_successfully:
                break
            counts.append(int(stdout.splitlines()[0]))

        if len(counts) == 2:
            bases = counts[0] - counts[1]
        elif len(counts) == 1:
            # Line count failed: keep the raw character count.
            bases = counts[0]

    pickle_name = str('estimate_coverage.' + os.path.basename(fastq_file))
    utils.saveVariableToPickle([run_successfully, bases], outdir, pickle_name)
コード例 #17
0
def fastQintegrity(fastq, outdir):
    """Check that a gzip- or bzip2-compressed FASTQ can be decompressed.

    ``gunzip`` is used when ``utils.compressionType`` returns ``'gz'`` and
    ``bunzip2`` when it returns ``'bz2'``; for any other value nothing is
    run.  Output goes to a temporary file inside ``outdir`` which is removed
    afterwards if it exists.

    ``run_successfully`` is saved through ``utils.saveVariableToPickle`` in
    ``outdir`` under the FASTQ basename.
    """
    run_successfully = False

    fastq_name = os.path.basename(fastq)
    temporary_output_file = os.path.join(outdir,
                                         os.path.splitext(fastq_name)[0])

    filetype = utils.compressionType(fastq)

    # Pick the decompression program matching the detected file type.
    decompressor = ''
    for extension, program in (('gz', 'gunzip'), ('bz2', 'bunzip2')):
        if filetype == extension:
            decompressor = program
            break

    if decompressor != '':
        command = [decompressor, '--stdout', '--keep', fastq, '>',
                   temporary_output_file]
        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(
            command, True, None, False)

    if os.path.isfile(temporary_output_file):
        os.remove(temporary_output_file)

    utils.saveVariableToPickle(run_successfully, outdir, fastq_name)
コード例 #18
0
ファイル: download.py プロジェクト: bfrgoncalves/ReMatCh
def downloadWithFtp(ftp_file_path, outdir, pickle_prefix):
	"""Download ``ftp_file_path`` into ``outdir`` with ``wget``.

	The file is stored under the last component of ``ftp_file_path``, and
	``run_successfully`` is saved through ``utils.saveVariableToPickle`` in
	``outdir`` under ``<pickle_prefix>.<file name>``.
	"""
	file_download = ftp_file_path.rsplit('/', 1)[1]
	destination = os.path.join(outdir, file_download)

	command = ['wget', ftp_file_path, '-O', destination]
	run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, 3600, True)

	pickle_name = str(pickle_prefix + '.' + file_download)
	utils.saveVariableToPickle(run_successfully, outdir, pickle_name)
コード例 #19
0
ファイル: download.py プロジェクト: bfrgoncalves/ReMatCh
def downloadWithAspera(aspera_file_path, asperaKey, outdir, pickle_prefix):
	"""Download ``aspera_file_path`` into ``outdir`` with ``ascp``.

	The remote is addressed as user ``era-fasp`` and ``asperaKey`` is passed
	through the ``-i`` option.  ``run_successfully`` is saved through
	``utils.saveVariableToPickle`` in ``outdir`` under
	``<pickle_prefix>.<last component of aspera_file_path>``.
	"""
	remote = str('era-fasp@' + aspera_file_path)
	command = ['ascp', '-QT', '-l', '300m', '-i', asperaKey, remote, outdir]
	run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, 3600, True)

	file_name = aspera_file_path.rsplit('/', 1)[1]
	utils.saveVariableToPickle(run_successfully, outdir, str(pickle_prefix + '.' + file_name))