Exemple #1
0
def cat_counter_references(counter_references=None, target_dir=curdir,
                           path_to_bowtie2='bowtie2',
                           logger=None, **kwargs):
    """Merge all counter-references into one bowtie2 index.

    Every valid counter-reference is dumped with ``<bowtie2>-inspect`` into
    ``<target_dir>/counter_references.fa``; that FASTA file is then indexed
    with ``<bowtie2>-build`` under ``<target_dir>/counter_references``.

    Parameters:
        counter_references: iterable of reference names/paths, or None.
        target_dir: directory that receives the FASTA file and the index.
        path_to_bowtie2: path (or command name) of the bowtie2 executable;
            '-inspect' and '-build' are appended to it.
        logger: forwarded to validate_references.
        **kwargs: accepted and ignored.

    Returns:
        The basename of the combined index, or None when
        ``counter_references`` is None or bowtie2-build reported an error.
    """
    if counter_references is None:
        return
    try:
        makedirs(target_dir, mode=0o755)
    except OSError:
        # Best effort: the directory normally exists already.  If it is
        # genuinely unusable, opening the FASTA file below will fail loudly.
        pass
    debug('Validating counter-references and building counter-reference index')
    valid_references = validate_references(references=counter_references,
                                           target_dir=target_dir,
                                           path_to_bowtie2=path_to_bowtie2,
                                           logger=logger,
                                           environ_key=
                                           'SOT_DEFAULT_COUNTER_REFERENCES')
    crefs_fa_path = join(target_dir, 'counter_references.fa')
    # Close (and thereby flush) the FASTA file before bowtie2-build reads it.
    with open(crefs_fa_path, 'w') as crefs_fa:
        for ref in valid_references:
            Popen([path_to_bowtie2 + '-inspect', ref], stdout=crefs_fa).wait()
    # The index basename is a fixed name; the original joined target_dir with
    # the counter_references list and handed the open file object to Popen,
    # both of which raise TypeError.
    crefs_index = join(target_dir, 'counter_references')
    args = [path_to_bowtie2 + '-build', crefs_fa_path, crefs_index]
    P = Popen(args, stderr=PIPE, universal_newlines=True)
    stderr = P.communicate()[1]
    if stderr.startswith('Error'):
        critical(stderr)
        critical('No counter-references will be used.')
        # Match the log message: do not hand back an index that was not built.
        return None
    return crefs_index
Exemple #2
0
def validate_references(references=None, path_to_bowtie2='bowtie2',
                        logger=None, environ_key='SOT_DEFAULT_REFERENCES',
                        target_dir=curdir,
                        **kwargs):
    """Resolve each reference to a usable bowtie2 index.

    For every reference, find_bowtie2_index is asked for an existing index.
    If none is found and the reference is an existing path, an index is
    built from it with fasta_to_bowtie2.  References that cannot be resolved
    are reported via ``critical`` and skipped.

    Parameters:
        references: iterable of reference names/paths.  When None, the
            whitespace-separated value of ``environ[environ_key]`` is used.
        path_to_bowtie2: path (or command name) of the bowtie2 executable.
        logger: accepted for interface compatibility; not used here.
        environ_key: environment variable consulted when references is None.
        target_dir: directory in which new indexes are built; created if
            it does not exist.
        **kwargs: accepted and ignored.

    Returns:
        A list of bowtie2 index basenames (possibly empty).

    Raises:
        OSError: if target_dir does not exist and cannot be created.
    """
    from os.path import isdir
    try:
        makedirs(target_dir, mode=0o755)
    except OSError:
        # makedirs raises when the directory already exists -- which is
        # always the case for the default curdir.  Only a real failure to
        # provide the directory is an error.
        if not isdir(target_dir):
            raise
    debug('Validating references')
    new_references = []
    if references is None:
        if environ_key in environ:
            references = environ[environ_key].split()
        else:
            critical('no reference genomes specified')
            return []

    for r in references:
        bowtie2_index = find_bowtie2_index(r, path_to_bowtie2=path_to_bowtie2)
        if bowtie2_index is not None:
            new_references.append(bowtie2_index)
            continue
        if exists(r):
            debug('Attempting to build bowtie2 index from %s', r)
            new_index = fasta_to_bowtie2(r, target_dir=target_dir,
                                         path_to_bowtie2=path_to_bowtie2)
            if new_index is not None:
                new_references.append(new_index)
                continue
            critical('Failed to build bowtie2 index.')
        critical('bowtie2 could not find the index for %s', r)
        critical('we will not align to %s', r)
    return new_references
Exemple #3
0
def find_bowtie_index(r, path_to_bowtie='bowtie'):
    """Locate the bowtie index named *r*.

    The name is first tried exactly as given, then as a path relative to the
    current working directory.

    Returns:
        The index basename (the path bowtie-inspect reported opening, with
        the '.1.ebwt' suffix removed), or None if no index was found.
    """
    index_basename = _probe_bowtie_index(r, path_to_bowtie)
    if index_basename is not None:
        return index_basename
    return _probe_bowtie_index(join(getcwd(), r), path_to_bowtie)


def _probe_bowtie_index(name, path_to_bowtie):
    """Run ``<bowtie>-inspect -v -s name``; return the index basename or None."""
    import shutil
    args = [path_to_bowtie + '-inspect', '-v', '-s', name]
    debug(' '.join(args))
    # The probe runs in a fresh empty directory (presumably so that relative
    # names are not resolved against the caller's cwd -- TODO confirm); the
    # directory is removed afterwards instead of being leaked.
    scratch_dir = mkdtemp()
    try:
        with open(devnull, 'w') as null:
            P = Popen(args, stdout=null, stderr=PIPE, cwd=scratch_dir,
                      universal_newlines=True)
            stderr = P.communicate()[1].splitlines()
    finally:
        shutil.rmtree(scratch_dir, ignore_errors=True)
    # Empty stderr means there is nothing to parse (the original raised
    # IndexError here).
    if not stderr or stderr[0].startswith('Could not locate'):
        return None
    for line in stderr:
        if line.startswith('Opening'):
            # The index file name is the quoted part of the line.
            index_ebwt1 = line[(1 + line.find('"')):line.rfind('"')]
            cut = index_ebwt1.find('.1.ebwt')
            # Without the expected suffix, keep the whole name rather than
            # silently dropping its last character.
            return index_ebwt1 if cut < 0 else index_ebwt1[:cut]
    return None
Exemple #4
0
def find_bowtie2_index(r, path_to_bowtie2='bowtie2'):
    """Locate the bowtie2 index named *r*.

    The name is first tried exactly as given, then relative to, in order:
    the current working directory, the directory part of path_to_bowtie2,
    and the 'indexes' subdirectory of that directory.

    Returns:
        The index basename (the path bowtie2-inspect reported opening, with
        the '.1.bt2' suffix removed), or None if no index was found.
    """
    index_basename = _probe_bowtie2_index(r, path_to_bowtie2)
    if index_basename is not None:
        return index_basename
    bowtie2_dir = os.path.split(path_to_bowtie2)[0]
    for d in [getcwd(), bowtie2_dir, join(bowtie2_dir, 'indexes')]:
        index_basename = _probe_bowtie2_index(join(d, r), path_to_bowtie2)
        if index_basename is not None:
            return index_basename
    return None


def _probe_bowtie2_index(name, path_to_bowtie2):
    """Run ``<bowtie2>-inspect -v -s name``; return the index basename or None."""
    import shutil
    args = [path_to_bowtie2 + '-inspect', '-v', '-s', name]
    debug(' '.join(args))
    # The probe runs in a fresh empty directory (presumably so that relative
    # names are not resolved against the caller's cwd -- TODO confirm); the
    # directory is removed afterwards instead of being leaked.
    scratch_dir = mkdtemp()
    try:
        with open(devnull, 'w') as null:
            P = Popen(args, stdout=null, stderr=PIPE, cwd=scratch_dir,
                      universal_newlines=True)
            stderr = P.communicate()[1].splitlines()
    finally:
        shutil.rmtree(scratch_dir, ignore_errors=True)
    # Empty stderr means there is nothing to parse (the original raised
    # IndexError here).
    if not stderr or stderr[0].startswith('Could not locate'):
        return None
    for line in stderr:
        if line.startswith('Opening'):
            # The index file name is the quoted part of the line.
            index_bt2 = line[(1 + line.find('"')):line.rfind('"')]
            cut = index_bt2.find('.1.bt2')
            # Without the expected suffix, keep the whole name rather than
            # silently dropping its last character.
            return index_bt2 if cut < 0 else index_bt2[:cut]
    return None
def remove_reads(parsed_filename, remove_all=False, sam_out=False,
                 debug=False, **kwargs):
    '''
Copy parsed_filename.input_file to parsed_filename.output_file, leaving out
every read whose qname was collected from parsed_filename.mapped_file.

With remove_all, every qname in the mapped file is collected; otherwise only
those of reads for which is_mapped(read) is true. The output is opened with
mode 'w' when sam_out is set and 'wb' otherwise, using the input file as
template. When debug is set, the number of collected names is logged.

Original author's note: you must be looking at a sorted file, or this won't
work.
'''
    write_opts = 'w' if sam_out else 'wb'

    with pysam.Samfile(parsed_filename.mapped_file) as mapped:
        # Short-circuit keeps is_mapped() uncalled when remove_all is set.
        reads_to_remove = set(read.qname for read in mapped
                              if remove_all or is_mapped(read))

    if debug:
        message = 'Found {!s} reads in {!s}'.format(
            len(reads_to_remove), parsed_filename.mapped_file)
        scripter.debug(message)

    with pysam.Samfile(parsed_filename.input_file) as bam_file:
        with pysam.Samfile(parsed_filename.output_file, write_opts,
                           template=bam_file) as out_bam_file:
            survivors = (read for read in bam_file
                         if read.qname not in reads_to_remove)
            for read in survivors:
                out_bam_file.write(read)
    def __init__(self, filename, sam_out=False, *args, **kwargs):
        """Parse *filename* and locate its mapped counterpart.

        Only files with a 'sam' or 'bam' extension are accepted.  Unless
        this is a dummy file, exactly one mapped file must match the glob
        ``mapped/<input_dir part 0>/*/<input_dir parts 2..>/<basename>``;
        it is stored in ``self.mapped_file``.  ``self.output_file`` is placed
        in ``self.output_dir`` with a 'sam' extension when sam_out is set and
        'bam' otherwise.

        Raises:
            InvalidFileException: if the extension is neither sam nor bam.
            scripter.Usage: if no mapped file, or more than one, matches.
        """
        super(SubtractBamFilenameParser, self).__init__(filename,
                                                        sam_out=sam_out,
                                                        *args,
                                                        **kwargs)
        # splitext() keeps the leading dot, so it must be stripped from the
        # left; stripping from the right left '.bam' untouched and made
        # every file look invalid.
        fext = os.path.splitext(filename)[1].lstrip(os.extsep)
        if fext not in ('sam', 'bam'):
            raise InvalidFileException
        if not self.is_dummy_file:
            # check for the mapped_file
            input_dir_parts = self.input_dir.split(os.sep)
            # The second component of input_dir is replaced by a wildcard.
            glob_path = ['mapped', input_dir_parts[0], '*'] + \
                        input_dir_parts[2:] + \
                        [os.path.basename(self.input_file)]
            potential_filenames = glob.glob(os.sep.join(glob_path))

            # Compare counts by value; 'is' on ints only works by accident.
            if len(potential_filenames) == 1:
                self.mapped_file = potential_filenames[0]
            elif not potential_filenames:
                raise scripter.Usage('Could not find mapped file')
            else:
                raise scripter.Usage('Ambiguous mapped file', *potential_filenames)
            # The other scripter.debug calls use logging-style %s arguments;
            # without a placeholder the extra argument cannot be formatted.
            scripter.debug('Mapped file will be %s', self.mapped_file)

            out_extension = 'sam' if sam_out else 'bam'
            self.output_file = os.sep.join(
                [self.output_dir, self.with_extension(out_extension)])
            scripter.debug('Output file will be %s', self.output_file)
Exemple #7
0
    def __init__(self, filename, *args, **kwargs):
        """Parse *filename* and record how it should be read.

        discover_file_format() supplies the opener and the format name.
        SAM/BAM files are read through pysam and their first read decides
        ``self.paired_end``; FASTQ files go through check_paired_end() and
        take ``self.fastq_source`` from the seventh dot-separated field of
        the protoname when there is one.

        Raises:
            scripter.InvalidFileException: for any other format, or for a
                SAM/BAM file that contains no reads.
        """
        super(BowtieFilenameParser, self).__init__(filename, *args, **kwargs)
        open_func, file_format = discover_file_format(filename)
        self.format = file_format
        self.open_func = open_func
        self.second_file = None
        if file_format in ('SAM', 'BAM'):
            self.use_pysam = True
            # try to open the file so we're sure it works
            f = pysam.Samfile(filename)
            try:
                # next() works on both Python 2 and 3 iterators.
                aread = next(f)
            except StopIteration:
                # A file without reads cannot be aligned; report it like any
                # other unusable input instead of leaking StopIteration.
                scripter.debug('Skipping file %s with no reads', filename)
                raise scripter.InvalidFileException
            finally:
                # Release the handle explicitly; 'del' alone does not.
                f.close()
            self.paired_end = aread.is_paired
            self.fastq_source = 'Unknown'
        elif file_format == 'FASTQ':
            self.use_pysam = False
            self.check_paired_end()
            name_fields = self.protoname.split('.')
            if len(name_fields) > 6:
                self.fastq_source = name_fields[6]
            else:
                self.fastq_source = 'Unknown'
        else:
            # self.second_file is always None here (set above and only
            # changed by check_paired_end), so only the single-file message
            # can apply.
            scripter.debug('Skipping file %s with dubious format',
                           filename)
            raise scripter.InvalidFileException
Exemple #8
0
def fastq_to_bowtie(fasta_file, target_dir=curdir, path_to_bowtie='bowtie'):
    """Build a bowtie index from *fasta_file* if it looks like FASTA.

    Despite the function's name, the input is treated as FASTA: the index is
    built only when the file exists and contains a line starting with '>'.

    Parameters:
        fasta_file: path of the candidate FASTA file.
        target_dir: directory joined in front of fasta_file to form the
            index basename.
        path_to_bowtie: path (or command name) of bowtie; '-build' is
            appended to it.

    Returns:
        ``join(getcwd(), target_dir, fasta_file)`` on success; None if the
        file is missing, has no '>' line, or bowtie-build's first stderr
        line starts with 'Error'.
    """
    import io
    if not exists(fasta_file):
        return None
    # io.open gives universal-newline text on Python 2 and 3 alike (the 'rU'
    # mode is rejected by Python 3.11+); latin-1 maps every byte, so sniffing
    # for the header can never fail on decoding.  The handle is closed before
    # the (potentially long) index build starts.
    with io.open(fasta_file, encoding='latin-1') as f:
        has_header = any(line.startswith('>') for line in f)
    if not has_header:
        return None

    args = [path_to_bowtie + '-build', fasta_file,
            join(target_dir, fasta_file)]
    debug(' '.join(args))
    with open(devnull, 'w') as null:
        P = Popen(args, stdout=null, stderr=PIPE, universal_newlines=True)
        stderr_lines = P.communicate()[1].splitlines()
    if stderr_lines and stderr_lines[0].startswith('Error'):
        return None
    return join(getcwd(), target_dir, fasta_file)
def fasta_to_bwa(fasta_file, path_to_bwa='bwa'):
    """Build a bwa index for *fasta_file* if it looks like FASTA.

    ``<bwa> index <fasta_file>`` is run only when the file exists and
    contains a line starting with '>'.

    Parameters:
        fasta_file: path of the candidate FASTA file.
        path_to_bwa: path (or command name) of the bwa executable.

    Returns:
        fasta_file on success; None if the file is missing, has no '>'
        line, or bwa's first stderr line starts with 'Error'.
    """
    import io
    if not exists(fasta_file):
        return None
    # io.open gives universal-newline text on Python 2 and 3 alike (the 'rU'
    # mode is rejected by Python 3.11+); latin-1 maps every byte, so sniffing
    # for the header can never fail on decoding.  The handle is closed before
    # the index build starts.
    with io.open(fasta_file, encoding='latin-1') as f:
        has_header = any(line.startswith('>') for line in f)
    if not has_header:
        return None

    args = [path_to_bwa, 'index', fasta_file]
    debug(" ".join(args))
    with open(devnull, 'w') as null:
        P = Popen(args, stdout=null, stderr=PIPE, universal_newlines=True)
        stderr_lines = P.communicate()[1].splitlines()
    # Empty stderr is not an error (the original raised IndexError on it).
    if stderr_lines and stderr_lines[0].startswith('Error'):
        return None
    return fasta_file
def validate_references(references=None, path_to_bwa='bwa',
                        logger=None, environ_key='SOT_DEFAULT_REFERENCES',
                        target_dir=curdir,
                        **kwargs):
    """Resolve each reference to a FASTA file that has a bwa index.

    A reference is accepted as-is when the file and all five index files
    (.amb, .ann, .bwt, .pac, .sa) exist next to it.  If only the file
    exists, fasta_to_bwa is asked to build the index.  Anything else is
    reported via ``critical`` and skipped.

    Parameters:
        references: iterable of reference paths.  When None, the
            whitespace-separated value of ``environ[environ_key]`` is used.
        path_to_bwa: path (or command name) of the bwa executable.
        logger: accepted for interface compatibility; not used here.
        environ_key: environment variable consulted when references is None.
        target_dir: accepted for interface compatibility; this function
            neither creates nor uses it.
        **kwargs: accepted and ignored.

    Returns:
        A list of usable reference paths (possibly empty).
    """
    debug('Validating references')
    if references is None:
        if environ_key in environ:
            references = environ[environ_key].split()
        else:
            critical('no reference genomes specified')
            return []

    index_suffixes = ('.amb', '.ann', '.bwt', '.pac', '.sa')
    new_references = []
    for r in references:
        if not exists(r):
            critical('bwa could not find the reference %s', r)
            critical('we will not align to %s', r)
            continue
        if all(exists(r + suffix) for suffix in index_suffixes):
            debug('Found bwa index for %s', r)
            new_references.append(r)
            continue
        info('Attempting to build bwa index from %s', r)
        # fasta_to_bwa(fasta_file, path_to_bwa) takes no target_dir; passing
        # one raised TypeError before any index could be built.
        new_index = fasta_to_bwa(r, path_to_bwa=path_to_bwa)
        if new_index is not None:
            new_references.append(new_index)
        else:
            critical('Failed to build bwa index.')
    return new_references
Exemple #11
0
 def __init__(self, filename, include_width_in_name=False,
              target=None, motif_file='unknown_motif', genome=None,
              *args, **kwargs):
     """Parse a peaks file name and find (or create) its FASTA file.

     Only 'bed' and 'xls' extensions are accepted.  The target passed to
     the parent class is *target* plus a subdirectory derived from the
     absolute path of *motif_file* (non-word characters become '_').

     A FASTA file named after the protoname ('fa', 'fasta', 'FA' or
     'FASTA') is looked for in the input directory.  If none exists, one
     is written with twobit_reader from the genome found by
     try_to_find_genome(genome).

     Raises:
         InvalidFileException: if the extension is neither bed nor xls.
         Usage: if no FASTA file exists and no usable genome was given.
     """
     fext = splitext(filename)[1].lstrip(extsep)
     if fext not in ('bed', 'xls'):
         raise InvalidFileException
     self.is_bed = (fext == 'bed')
     self.is_xls = (fext == 'xls')
     # Raw string: '\W' is a regex class, not a string escape.
     motif_name = sub(r'\W', '_', abspath(motif_file))
     # NOTE(review): with the default target=None this concatenation raises
     # TypeError -- callers apparently always pass a target; confirm.
     target = target + sep + motif_name
     super(PeaksFilenameParser, self).__init__(filename,
                                          target = target,
                                          *args, **kwargs)
     self.fasta_file = None
     for file_extension in ['fa', 'fasta', 'FA', 'FASTA']:
         fasta_file = join(self.input_dir,
                         extsep.join([self.protoname, file_extension]))
         # Logging-style calls need a placeholder for their argument.
         debug("Trying %s", fasta_file)
         if exists(fasta_file):
             self.fasta_file = fasta_file
             debug("Using %s", fasta_file)
             break
     if self.fasta_file is not None:
         return

     warning('Could not find the FASTA file for %s',
                   self.input_file)
     if genome is None:
         raise Usage("Could not find the FASTA file for ", self.input_file,
                     " and no genome was specified")
     t = try_to_find_genome(genome)
     if t is None:
         raise Usage("Could not find the FASTA file for ", self.input_file,
                     " and failed to use %s" % genome)
     fasta_file = join(self.input_dir, '%s.fa' % self.protoname)
     debug('Creating FASTA file %s for %s using %s',
                    fasta_file, self.input_file, genome)
     # Context managers close both handles even if twobit_reader raises;
     # the input handle used to be left open.
     with open(self.input_file, 'rU') as input_fhd:
         with open(fasta_file, 'w') as fasta_fhd:
             twobit_reader(t, input_stream=input_fhd,
                           write=fasta_fhd.write)
     self.fasta_file = fasta_file
Exemple #12
0
 def __init__(self, filename, controls=None, *args, **kwargs):
     """Parse a BAM file name and attach its control, if any.

     *controls* maps a sample (by protoname or by input file path) to a
     ``(sample_name, control)`` tuple.  An entry keyed by the input file
     path takes precedence and its control is used verbatim; an entry keyed
     by the protoname yields ``<input_dir>/<control>.bam``.  A file that is
     itself listed as a control is rejected.  A file found nowhere gets no
     control and is registered in *controls* as ``(sample, None)``.

     ``self.output_dir`` is extended with the sample name.

     Raises:
         scripter.InvalidFileException: if the extension is not '.bam', or
             the file is a control of another sample.
     """
     if not os.path.splitext(filename)[1] == '.bam':
         raise scripter.InvalidFileException(filename)
     super(BAMFilenameParser, self).__init__(filename, *args, **kwargs)

     # A fresh dict per call instead of one mutable default shared by every
     # instance.
     if controls is None:
         controls = {}

     sample = self.protoname
     control_files = [v[1] for v in controls.values()]
     # check controls
     # The branches are one chain: previously a control found under the
     # sample name fell through to the final 'else' and was overwritten
     # with None (in self and in the controls dict).
     if self.input_file in controls:
         sample_name, control = controls[self.input_file]
         scripter.debug('%s has control %s', self.input_file, control)
         self.control_file = control
     elif sample in control_files or self.input_file in control_files:
         scripter.debug('%s is a control, aborting', sample)
         raise scripter.InvalidFileException
     elif sample in controls:
         sample_name, control = controls[sample]
         scripter.debug('%s has control %s', sample, control)
         if control is None:
             self.control_file = None
         else:
             self.control_file = os.path.join(self.input_dir,
                                              control + '.bam')
     else:
         scripter.debug('%s has no control indicated, continuing anyway',
                        sample)
         # not in setup.txt, make an entry in controls
         self.control_file = None
         sample_name = sample
         controls[sample] = (sample, None)

     self.sample_name = sample_name
     self.output_dir = os.path.join(self.output_dir, sample_name)
    def __init__(self, filename, verbose=False, *args, **kwargs):
        """Parse *filename* and detect whether it is a paired-end read file.

        New-style names are recognised through get_new_pair_info(); failing
        that, names with at least three underscores are treated as
        old-style, where the third '_'-separated field is the read number.
        For a first-read file whose partner exists, ``self.second_file`` and
        ``self.protoname2`` are set.  ``self.paired_end`` is always set.

        *verbose* is accepted but not used here.

        Raises:
            scripter.InvalidFileException: if this is the second file of a
                pair (it is handled together with the first).
        """
        super(BarcodeFilenameParser, self).__init__(filename,
                                                    *args, **kwargs)
        protoname = self.protoname
        # check for old-style
        # The original sliced the tuple returned by splitext(), which can
        # never equal 'all'.
        # NOTE(review): the stripped name is not used below -- confirm
        # whether it was meant to replace self.protoname.
        if protoname[-3:] == 'all':
            protoname = protoname[0:-4]

        # check if this is a paired-end file
        # if so, grab its partner
        input_file = self.input_file
        illumina_name = os.path.basename(input_file)
        paired_end = False

        # try new style first
        new_info = get_new_pair_info(illumina_name)
        if new_info is not None:
            scripter.debug('NOTICE: Detected new-style paired read file.')
            read = new_info[0]
            if read == 'R2':
                scripter.debug('This is the second file, ignoring it.')
                raise scripter.InvalidFileException(input_file)
            elif read == 'R1':
                second_file = os.path.join(self.input_dir, new_info[1])
                paired_end = self._attach_second_file(second_file)
            else:
                scripter.debug('Failed to find paired end')
        elif illumina_name.count('_') >= 3:
            scripter.debug('NOTICE: Detected paired read file.')
            iln_parts = illumina_name.split('_')
            if iln_parts[2] == '1':
                scripter.debug('Attempting to find second file.')
                # Same name with the read-number field switched to '2'.
                second_file = os.sep.join([self.input_dir,
                                           '_'.join(iln_parts[0:2] + ['2']
                                                    + iln_parts[3:])])
                paired_end = self._attach_second_file(second_file)
            elif iln_parts[2] == '2':
                scripter.debug('This is the second file, ignoring it.')
                raise scripter.InvalidFileException(input_file)
            else:
                scripter.debug('Failed to find paired end')
        self.paired_end = paired_end

    def _attach_second_file(self, second_file):
        """Record *second_file* as the mate file if it exists; return success."""
        try:
            scripter.assert_path(second_file)
        except IOError:
            scripter.debug('Failed to find paired end file')
            return False
        scripter.debug('Found %s', second_file)
        self.second_file = second_file
        self.protoname2 = os.path.splitext(
            os.path.basename(second_file))[0]
        return True
Exemple #14
0
    def check_paired_end(self):
        """Detect whether the input file is the first file of a read pair.

        The base name is given to get_pair_info() and, if that returns None,
        to get_new_pair_info().  The result is used as
        ``(pair_index, second_name, new_name)``.  For pair index '1',
        ``self.second_file`` and ``self.protoname`` are updated and
        ``self.paired_end`` reflects whether the second file exists.  In all
        other non-raising cases ``self.paired_end`` is False.

        Raises:
            scripter.InvalidFileException: if this is the second file of a
                pair (pair index '2').
        """
        # check if this is a paired-end file
        # if so, grab its partner
        seqfile_name = os.path.basename(self.input_file)
        pair_info = get_pair_info(seqfile_name)
        if pair_info is None:
            pair_info = get_new_pair_info(seqfile_name)
        if pair_info is None:
            scripter.debug('This file contains single-end reads.')
            self.paired_end = False
            return

        pair_index, second_name, new_name = (pair_info[0], pair_info[1],
                                             pair_info[2])
        scripter.debug('NOTICE: Detected paired read file.')
        if pair_index == '1':
            scripter.debug('Attempting to find second file.')

            self.second_file = os.sep.join([self.input_dir, second_name])
            self.protoname = os.path.splitext(new_name)[0]
            try:
                scripter.assert_path(self.second_file)
            except IOError:
                # NOTE(review): second_file and protoname keep their new
                # values even though the partner is missing -- confirm that
                # downstream code checks paired_end first.
                scripter.debug('Failed to find paired end file')
                self.paired_end = False
            else:
                # Report success only once the file is known to exist; the
                # message used to be logged before the check.
                scripter.debug('Found %s', self.second_file)
                self.paired_end = True
        elif pair_index == '2':
            scripter.debug('This is the second file, ignoring it.')
            raise scripter.InvalidFileException
        else:
            scripter.debug('Failed to find paired end')
            self.paired_end = False
def main():
    """
    runs the main checkmyclones script

    Parses the command line, loads the clone and reference sequences,
    compares every clone with every reference (forward and/or
    reverse-complement) in a multiprocessing pool, and prints the resulting
    alignments grouped by clone name.

    Raises Usage when no clone or no reference sequences can be found.
    """
    e = scripter.Environment(doc=__doc__, version=VERSION, handle_files=False)
    parser = e.argument_parser
    parser.add_argument(
        "--path-to-gbdb",
        default="/gbdb",
        help='Location of "gbdb" or 2bit files. If gbdb is not in /gbdb or C:\\gbdb, specify the path here',
    )
    ggroup = parser.add_mutually_exclusive_group()
    ggroup.add_argument(
        "--genome", help="Use 2bit file foo as reference genome (Looks also for {path-to-gbdb}/foo/foo.2bit)"
    )
    # The shortcuts must store into 'genome'; without dest= each one wrote
    # to its own attribute and the default genome was used regardless.
    for shortcut in ("hg18", "hg19", "mm9"):
        ggroup.add_argument(
            "--" + shortcut,
            dest="genome",
            const=shortcut,
            action="store_const",
            help="Shortcut for --genome " + shortcut,
        )
    parser.add_argument("--reverse-orientation", action="store_true", help="Check only the reverse orientation")
    parser.add_argument("--both-orientations", action="store_true", help="Check forward and reverse orientations")
    parser.add_argument("--clones", nargs="+", help="list of files that contain clone sequences")
    parser.add_argument("--references", nargs="*", help="list of files that contain reference sequences")
    parser.add_argument("--bed-reference", help="Use the regions listed in the bed file as reference sequences")
    parser.add_argument("--only-use-references", nargs="*", help="Use the only regions with the following names")
    parser.set_defaults(**{"genome": "hg19", "logging_level": WARNING})
    args = parser.parse_args()
    context = vars(args)
    scripter.LOGGER.setLevel(context["logging_level"])
    # 'recursive' and 'num_cpus' are not defined above; presumably the
    # scripter.Environment parser provides them -- TODO confirm.
    clones = load_all_seqs(context["clones"], recursive=context["recursive"])
    if len(clones) == 0:
        raise Usage("Could not find any clone sequences")
    if context["references"] is None and context["bed_reference"] is None:
        raise Usage("No reference sequences specified")

    ref_seqs = []
    if context["bed_reference"] is not None:
        genome = find_2bit_file(context["genome"], context["path_to_gbdb"])
        print("Fetching sequences from %s using %s" % (context["bed_reference"], genome))
        ref_seqs.extend(read_bed_file(context["bed_reference"], genome=genome))
    if context["references"] is not None:
        ref_seqs.extend(load_all_seqs(context["references"], recursive=context["recursive"]))
    specified_references = context["only_use_references"]
    if specified_references is not None:
        # A list (not a lazy filter object) so len() below works everywhere.
        ref_seqs = [ref for ref in ref_seqs if real_name(ref.Name) in specified_references]
    if len(ref_seqs) == 0:
        raise Usage("Could not find any reference sequences")

    signal.signal(signal.SIGCHLD, signal.SIG_DFL)
    debug("multiprocessing enabled")
    p = multiprocessing.Pool(processes=context["num_cpus"])
    debug("Initialized pool of %d workers", context["num_cpus"])
    results = []
    forward = not context["reverse_orientation"]
    rc = context["reverse_orientation"] or context["both_orientations"] or False
    for ref in ref_seqs:
        print("Loaded reference %s" % ref.Name)
    for clone in clones:
        p.apply_async(announce_first, (clone,), context)
        for ref in ref_seqs:
            if forward:
                results.append(p.apply_async(compare_clone_to_ref, (clone, ref), context))
            if rc:
                results.append(p.apply_async(compare_clone_to_ref, (clone.rc(), ref), context))
    p.close()
    p.join()

    # Each worker returns a pickle produced by this program's own
    # compare_clone_to_ref; None marks "nothing to report".
    result_values = []
    for r in results:
        current_result = loads(r.get())
        if current_result is not None:
            result_values.append(current_result)

    all_matches = []
    # groupby only merges adjacent items; results arrive in submission
    # order, i.e. clone by clone.
    for clone_name, group in groupby(result_values, key=itemgetter(0)):
        alns = [item[1] for item in group]
        matches = [aln for aln in alns if not aln.is_truncated and not aln.has_gaps]
        if matches:
            # Any clean alignment promotes the clone's whole set.
            all_matches.extend(alns)
        elif not alns:
            # Defensive only: groupby never yields an empty group.
            print("No match for %s" % clone_name)
        else:
            print_good_alns(alns)
    print_matched_alns(all_matches)
    return