Example #1
def label_multimapping(kmers_trie, filename, n):
    """Read multimapping k-mers and counts from file and add counts to trie.

    For each k-mer and its count, find the k-mer in the input trie and store
    the count at position 0 of the array held as that k-mer's value.

    Args:
    kmers_trie: trie.trie object of the form described in and returned
                by build_kmers_trie()
    filename: name of file with k-mers and counts for k-mers occurring too
              many times in the genome,
              e.g., file <name>/<name>_kmers_counts.txt.gz
              produced by kmers.sort_count_kmers();
              assume file is gzipped
    n: length of the k-mer prefix used to select the trie (see get_num())

    Return:
    modified input trie.trie object
    """
    util.check_file_exists(filename)
    f = gzip.open(filename)
    count_labeled = 0
    for line in f:
        kmer, count = line.split()
        index = get_num(kmer, n)
        kmer2 = kmer[n:]

        if kmers_trie[index].has_key(kmer2):
            kmers_trie[index][kmer2][0] = count
            count_labeled += 1
    f.close()
    print '%s k-mers assigned counts' % count_labeled
    return kmers_trie
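The helper get_num() and the suffix slice kmer[n:] implement the prefix partitioning used throughout these examples but are not defined in this excerpt; a minimal sketch, assuming a base-4 A/C/G/T encoding of the first n bases, could look like this:

BASE = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

def get_num_sketch(kmer, n):
    # Hypothetical stand-in for get_num(): base-4 encode the n-base prefix,
    # yielding an index in [0, 4**n) that selects one of the per-prefix tries.
    index = 0
    for ch in kmer[:n]:
        index = index * 4 + BASE[ch]
    return index

kmer = 'ACGTTTTTGG'
index, key = get_num_sketch(kmer, 2), kmer[2:]   # index 1, trie key 'GTTTTTGG'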
Example #2
    def load_base_path(self, pth):
        """Load using a base path folder"""
        # Allow for user and environment variables
        pth = os.path.expanduser(pth)
        pth = os.path.expandvars(pth)
        pth = os.path.normpath(pth)

        folder, filename = self.find_config_file(pth)

        # check that user didn't enter a file instead of a folder
        # if os.path.isfile(pth):
        # log.error("The second argument of the commandline currently
        # points to a file, but it should point to the folder that contains
        # the alignment and .cfg files, please check.")
        # raise ConfigurationError

        self.set_base_path(folder)

        # From now on we refer to relative paths
        config_path = os.path.join(self.base_path, filename)
        log.debug("About to search for partition_finder.cfg file...")
        config_path = os.path.join(self.base_path, "partition_finder.cfg")
        util.check_file_exists(config_path)

        self._output_folders = []
        self.register_output_folders()

        self.init_logger(self.base_path)
        self.load(config_path)
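For illustration, the expansion chain at the top of load_base_path() behaves roughly as follows (a hedged sketch with a made-up path and environment variable):

import os

pth = '~/analyses/$RUN_NAME/partition/..'   # hypothetical input
pth = os.path.expanduser(pth)               # '~' becomes the user's home directory
pth = os.path.expandvars(pth)               # '$RUN_NAME' is substituted, if set
pth = os.path.normpath(pth)                 # the trailing '..' is collapsed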
Example #3
def restore_trie_arrays(kmers_trie, keysfile, n):
    """Restore numpy arrays from str in values of kmer trie loaded from disk.

    When loaded from disk, numpy arrays in values of a previously stored trie
    are loaded as string characters. This function restores the initial arrays
    using numpy.fromstring(). To save memory, avoid building a list of the
    trie's keys; instead, read the k-mers from a file.

    Note: saving and loading tries (especially with np.array values)
    may be system-dependent (e.g., depend on 64bit or 32bit arithmetic
    and endianness), use caution when transferring files between systems.

    Args:
    kmers_trie: trie.trie object loaded from disk (e.g., using load_trie())
    keysfile: name of file where first field of each line is a k-mer, assume
              file is gzipped; loop only over the k-mers in this file
    n: length of the k-mer prefix used to select the trie (see get_num())
    """
    util.check_file_exists(keysfile)
    f = gzip.open(keysfile)
    for line in f:
        kmer = line.split()[0]
        index = get_num(kmer, n)
        kmer2 = kmer[n:]

        if kmers_trie[index].has_key(kmer2):
            value = kmers_trie[index][kmer2]
            if isinstance(value, basestring):
                kmers_trie[index][kmer2] = np.fromstring(value, dtype=int)
    f.close()
    return kmers_trie
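The round trip the docstring describes (an array stored as a raw byte string, then restored with numpy.fromstring()) can be sketched in isolation; note that recent NumPy versions deprecate fromstring()/tostring() in favour of frombuffer()/tobytes():

import numpy as np

arr = np.array([0, 1234567, 7654321], dtype=int)
raw = arr.tostring()                       # what a trie value looks like after loading from disk
restored = np.fromstring(raw, dtype=int)   # what restore_trie_arrays() rebuilds
assert (restored == arr).all()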
Example #4
def get_model(model_name, db, num_classes=256, batch_size=100, epochs=75, new=False):
    if new is False:
        try:
            check_file_exists(model_name)
            # print('oke')
            return load_model(model_name)
        except:
            pass

    (x_profiling, y_profiling), (x_attack, y_attack) = load_ascad(db)

    y_profiling = to_categorical(y_profiling, num_classes=num_classes, dtype='int32')
    y_attack = to_categorical(y_attack, num_classes=num_classes, dtype='int32')

    save_model = ModelCheckpoint(model_name)
    callbacks = [save_model]

    # model = spread_model(num_classes)
    # model = mlp_model(num_classes)
    model = cnn_model(num_classes)
    x_profiling = x_profiling[0:5000]
    y_profiling = y_profiling[0:5000]

    # num_traces = 500
    # x_profiling = x_profiling[:num_traces, :]
    # y_profiling = y_profiling[:num_traces, :]
    if len(model.get_layer(index=0).input_shape) == 3:
        x_profiling = x_profiling.reshape((x_profiling.shape[0], x_profiling.shape[1], 1))
    model.fit(x_profiling, y_profiling, epochs=epochs, batch_size=batch_size, callbacks=callbacks)

    return model
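The reshape near the end follows the usual Conv1D input convention of (samples, timesteps, channels); a minimal sketch with a hypothetical trace matrix:

import numpy as np

x_profiling = np.zeros((5000, 700))   # hypothetical (traces, samples per trace)
# If the model's first layer expects 3-D input, add a trailing channel axis:
x_profiling = x_profiling.reshape((x_profiling.shape[0], x_profiling.shape[1], 1))
assert x_profiling.shape == (5000, 700, 1)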
Example #5
    def load_base_path(self, pth):
        """Load using a base path folder"""
        # Allow for user and environment variables
        pth = os.path.expanduser(pth)
        pth = os.path.expandvars(pth)
        pth = os.path.normpath(pth)

        folder, filename = self.find_config_file(pth)

        # check that user didn't enter a file instead of a folder
        # if os.path.isfile(pth):
            # log.error("The second argument of the commandline currently
            # points to a file, but it should point to the folder that contains
            # the alignment and .cfg files, please check.")
            # raise ConfigurationError

        self.set_base_path(folder)

        # From now on we refer to relative paths
        config_path = os.path.join(self.base_path, filename)
        log.debug("About to search for partition_finder.cfg file...")
        config_path = os.path.join(self.base_path, "partition_finder.cfg")
        util.check_file_exists(config_path)

        self._output_folders = []
        self.register_output_folders()

        self.init_logger(self.base_path)
        self.load(config_path)
Example #6
def get_test_subtasks(target_test_name, mapping_file):
    check_file_exists(mapping_file)
    subtasks = []
    with open(mapping_file, 'r') as f:
        for line in f.readlines():
            line_subtask, line_test_name = line.split()
            if line_test_name == target_test_name:
                subtasks.append(line_subtask)
    return subtasks
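The mapping file is expected to hold one '<subtask> <test name>' pair per line; a small sketch of the lookup with hypothetical contents:

lines = ['subtask1 01-small', 'subtask2 01-small', 'subtask2 02-large']
subtasks = [s for s, t in (line.split() for line in lines) if t == '01-small']
assert subtasks == ['subtask1', 'subtask2']   # what get_test_subtasks('01-small', ...) would return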
Example #7
 def validate(self):
     """Should be called before processing"""
     # Just path validation for now.
     util.check_folder_exists(self.base_path)
     self.alignment_path = os.path.join(self.base_path, self.alignment)
     log.info("Looking for alignment file '%s'...", self.alignment_path)
     util.check_file_exists(self.alignment_path)
     if self.user_tree is None:
         self.user_tree_topology_path = None
     else:
         self.user_tree_topology_path = os.path.join(self.base_path, self.user_tree)
         log.info("Looking for tree file '%s'...", self.user_tree_topology_path)
         util.check_file_exists(self.user_tree_topology_path)
Example #8
 def validate(self):
     """Should be called before processing"""
     # Just path validation for now.
     util.check_folder_exists(self.base_path)
     self.alignment_path = os.path.join(self.base_path, self.alignment)
     log.info("Looking for alignment file '%s'...", self.alignment_path)
     util.check_file_exists(self.alignment_path)
     if self.user_tree is None:
         self.user_tree_topology_path = None
     else:
         self.user_tree_topology_path = \
                 os.path.join(self.base_path, self.user_tree)
         log.info("Looking for tree file '{}'...".format(
             self.user_tree_topology_path))
         util.check_file_exists(self.user_tree_topology_path)
Example #9
 def __verify_post_backup(self, backup_dir: str):
     # Check that database backup is there
     logger.info("Verifying backup sanity from {}".format(backup_dir))
     db_backup_fn = "{path}/backup.sql".format(path=Path(backup_dir))
     if not check_file_exists(db_backup_fn,
                              container=DockerContainer.LOCAL):
         logger.error(
             "Database backup file {} not present.".format(db_backup_fn))
         raise
Example #10
def build_kmers_tries(kmers_filename, goodkeys_filename, badkeys_filename, 
                      kmers_trie_filename, genome, altpam, pampos, maxcount, n):
    util.check_file_exists(kmers_filename)
    if goodkeys_filename:
        goodkeys = gzip.open(goodkeys_filename, 'w')
    if badkeys_filename:
        badkeys = gzip.open(badkeys_filename,'w')

    kmers_trie = trie.trie()

    f = gzip.open(kmers_filename)
    for line in f:
        kmer, coord = line.strip().split()
        kmer2 = kmer[n:]

        if kmers_trie.has_key(kmer2):
            arr = kmers_trie[kmer2]
            if len(arr) < maxcount + 1:
                coord_int = util.map_coord_to_int(coord, genome)
                arr = np.append(arr, coord_int)
                arr[0] = len(arr) - 1
                kmers_trie[kmer2] = arr
        else:
            coord_int = util.map_coord_to_int(coord, genome)
            label = 0
            if pampos == 'start' and any(kmer.startswith(p) for p in altpam):
                label = 1
            if pampos == 'end' and any(kmer.endswith(p) for p in altpam):
                label = 1
            kmers_trie[kmer2] = np.array([label, coord_int])
            
            if label == 0 and goodkeys_filename:
                goodkeys.write('%s\n' % kmer)
            if label != 0 and badkeys_filename:
                badkeys.write('%s\n' % kmer)
    
    save_single_trie(kmers_trie, kmers_trie_filename)

    if goodkeys_filename:
        goodkeys.close()
    if badkeys_filename:
        badkeys.close()
    f.close()
Example #11
def sam_to_bam(samfile, bamfile, index=False):
    """Produce sorted and indexed BAM file from SAM file with guideRNAs.

    Relies on 'gzip', 'samtools' available in the system.

    Args:
    samfile: where SAM file is stored, assume file is gzipped
    bamfile: where to store BAM file
    index: if True, index the resulting BAM file
    """
    util.check_file_exists(samfile)
    # util.warn_file_exists(bamfile)
    samtools_command = 'gzip -cd %s | samtools view -hb - ' \
                       '| samtools sort - > %s' \
                       % (samfile, bamfile)
    # print samtools_command
    os.system(samtools_command)
    if index:
        samtools_index_command = 'samtools index %s' % bamfile
        print samtools_index_command
        os.system(samtools_index_command)
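A hypothetical invocation, assuming gzip and samtools are on the PATH as the docstring requires:

sam_to_bam('guides.sam.gz', 'guides.bam', index=True)
# roughly equivalent to the shell pipeline:
#   gzip -cd guides.sam.gz | samtools view -hb - | samtools sort - > guides.bam
#   samtools index guides.bam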
Example #12
def filter_trie_mismatch(badkeysfile, index, kmers_trie, keysfile, sim, n, parts):
    
    util.check_file_exists(keysfile)

    index_seq = generate_four(index, n)
    mismatches = []
    for i in range(parts):
        index_seq1 = generate_four(i, n)
        mismatch = four_compare(index_seq, index_seq1, n)
        mismatches.append(mismatch)

    f = gzip.open(keysfile)

    badkeys = gzip.open(badkeysfile, 'w')
    
    for line in f:
        kmer = line.split()[0]
        kmer2 = kmer[n:]

        if not kmers_trie[index].has_key(kmer2):
            continue
        if kmers_trie[index][kmer2][0] != 0:
            continue 

        for i in range(parts):
            
            mismatch = mismatches[i]
            if mismatch > sim:
                continue 
            else:
                all_dist = kmers_trie[i].get_approximate_hamming(kmer2, sim-mismatch)

                if any(dist+mismatch > 0 for seq,arr,dist in all_dist):
                    badkeys.write('%s %s\n' % (index, kmer2))
                
    f.close()
    badkeys.flush()
    badkeys.close()
    return kmers_trie
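The mismatch budget above splits the Hamming distance between the n-base prefix (precomputed per trie by four_compare()) and the stored suffix; a pure-Python sketch of that identity, independent of the trie library:

def hamming(a, b):
    return sum(x != y for x, y in zip(a, b))

guide     = 'ACGTTTTTGGAC'
candidate = 'ATGTTTTTGGAA'
n, sim = 2, 3
prefix_mismatch = hamming(guide[:n], candidate[:n])   # handled once per trie
suffix_budget = sim - prefix_mismatch                 # allowance left for the suffix search
assert hamming(guide, candidate) == prefix_mismatch + hamming(guide[n:], candidate[n:])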
Example #13
def read_test_data(file):

    # assume if one is saved they all are
    if util.check_file_exists(CONST.DATASET_PATH + CONST.TEST_PATH):
        T_Data = util.load(CONST.DATASET_PATH + CONST.TEST_PATH)
        T_Labels = util.load(CONST.DATASET_PATH + CONST.TEST_PATH_LABELS)
        T_Queries = util.load(CONST.DATASET_PATH + CONST.TEST_PATH_Q)
        T_Docs = util.load(CONST.DATASET_PATH + CONST.TEST_PATH_DOCS)

    else:
        T_Data, T_Labels, T_Queries, T_Docs = read_train_data(file)

        util.save_pickle(CONST.DATASET_PATH + CONST.TEST_PATH, T_Data)
        util.save_pickle(CONST.DATASET_PATH + CONST.TEST_PATH_LABELS, T_Labels)
        util.save_pickle(CONST.DATASET_PATH + CONST.TEST_PATH_Q, T_Queries)
        util.save_pickle(CONST.DATASET_PATH + CONST.TEST_PATH_DOCS, T_Docs)

    return T_Data, T_Labels, T_Queries, T_Docs
Example #14
    def __verify_post_backup(self, backup_dir: str):
        # Check that database backup is there
        logger.info("Verifying backup sanity from {}".format(backup_dir))
        db_backup_fn = "{path}/backup.sql".format(path=Path(backup_dir))
        if not check_file_exists(db_backup_fn, container=DockerContainer.BACKUP):
            logger.error("Database backup file {} not present.".format(db_backup_fn))
            raise

        data_backup_dir = "{path}/data".format(path=Path(backup_dir))
        if not check_dir_exist(data_backup_dir, container=DockerContainer.BACKUP):
            logger.error("Data backup directory {} non-existent.".format(data_backup_dir))
            raise

        if check_dir_empty(data_backup_dir, container=DockerContainer.BACKUP):
            logger.error("Data backup directory {} empty.".format(data_backup_dir))
            raise

        backup_size = get_folder_size_in_bytes(data_backup_dir, container=DockerContainer.BACKUP)
        if backup_size == 0:
            logger.error("Backup size: {}.".format(backup_size))
            raise

        backup_size_str = get_folder_size_human(data_backup_dir, container=DockerContainer.BACKUP)
        logger.info("Backup successful verified in {}".format(backup_size_str))
Example #15
def do_pandoc_generation(notes_folder: str, temp_folder: str, html_folder: str) -> None:
    logger: Logger = get_logger()

    for folder in [notes_folder, temp_folder, html_folder]:
        logger.info('creating folder: \'%s\' if it doesn\'t exist already', folder)
        util.create_folder(folder)

    # only queue up files for pandoc generation if they (or the files that
    # point to them) have been modified recently, so that we don't have to
    # regenerate everything each time we make one change in one file.
    state_file: dict = util.read_existing_json_state_file(location=temp_folder)
    relevant_file_names: Set[str] = set()
    for file_name in os.listdir(notes_folder):
        if not util.is_md(file_name):
            continue
        key: str = util.strip_file_extension(file_name)
        if state_file['files'][key]['last_checked'] == state_file['runtime']:
            relevant_file_names.add(file_name)
            # ensure that we also refresh the backlinks for the files that are
            # referenced by this file (since the links go two ways)
            with open(util.path(notes_folder, file_name), 'r') as f:
                contents = f.read()
                # the results of re.findall() will look something like
                # [('Page B', 'pageB.md')]
                # where the link in markdown would've been [Page B](pageB.md)
                for _, link in util.md_links.findall(contents):
                    if util.is_md(link):
                        relevant_file_names.add(link)

    for file in relevant_file_names:
        # the path to the note is always gonna be in the notes_folder
        file_full_path: str = util.path(notes_folder, file)
        note_title = util.note_title(file_full_path)

        # the output HTML file should have the same name as the note but with
        # the .html suffix and it should be in the html folder
        file_html: str = util.path(html_folder, file)
        file_html: str = util.change_file_extension(file_html, '.html')

        # the backlinks file should have the same name as the note but with
        # the .md.backlinks suffix, and it should be in the temp folder
        file_backlinks: str = util.path(temp_folder, file + '.backlinks')

        logger.info('converting %s to html, title=%s', file, note_title)
        util.do_run(cmd=[
            'pandoc',
            file_full_path, file_backlinks,
            f'--defaults=pandoc.yaml',
            f'--id-prefix={util.to_footnote_id(file)}',
            f'--output={file_html}',
            f'--metadata=pagetitle:{note_title}'
        ])

    # if the index.md was generated in the temp folder, pandocify it
    index_file_name = 'index.md'
    generated_index_file = util.path(temp_folder, index_file_name)
    if util.check_file_exists(generated_index_file):
        output_file = util.path(
            html_folder, util.change_file_extension(index_file_name, '.html'))
        index_title = util.note_title(generated_index_file)
        logger.debug('converting %s to html, title=%s', generated_index_file, index_title)
        util.do_run(cmd=[
            'pandoc',
            generated_index_file,
            f'--defaults=pandoc.yaml',
            f'--id-prefix={util.to_footnote_id(index_file_name)}',
            f'--output={output_file}',
            f'--metadata=pagetitle:{index_title}'
        ])
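util.md_links is not shown in this excerpt; a hedged sketch of a pattern with the behaviour the comment describes (findall() yielding (text, target) pairs for markdown links):

import re

md_links = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')   # hypothetical stand-in for util.md_links
assert md_links.findall('see [Page B](pageB.md)') == [('Page B', 'pageB.md')]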
Example #16
if not util.check_folder(
        os.path.join(util.get_script_path(), "data"), logger, False,
        False):
    cleanup(-1, logger)

if not util.check_folder(
        os.path.join(util.get_script_path(), "data", args.ide), logger, False,
        False):
    cleanup(-1, logger)

if not util.check_folder(
        os.path.join(util.get_script_path(), "data", args.ide, "debian"),
        logger, False, False):
    cleanup(-1, logger)

# Checking files
for file in ["control.in", "postinst", "sysctl-99.conf"]:
    if not util.check_file_exists(os.path.join(util.get_script_path(), "data", args.ide, "debian", file)) and not \
            util.check_file_readable(os.path.join(util.get_script_path(), "data", args.ide, "debian", file)):
        logger.error("%s does not exist or is not readable." % file)
        cleanup(-1, logger)

for file in [
        "LICENSE", "Makefile", "pkginfo.in", "prototype.in", "icon.desktop",
        "start.sh", "vmoptions.README"
]:
    if not util.check_file_exists(os.path.join(util.get_script_path(), "data", args.ide, file)) and not \
            util.check_file_readable(os.path.join(util.get_script_path(), "data", args.ide, file)):
        logger.error("%s does not exist or is not readable." % file)
        cleanup(-1, logger)

# Download URL
if util.check_file_exists(os.path.join(util.get_script_path(), "tmp", link.split("/")[-1])):
    if not util.delete_file(os.path.join(util.get_script_path(), "tmp", link.split("/")[-1]), logger, False):
        cleanup(-1, logger)
Example #17
if __name__ == '__main__':

    if len(sys.argv) != 3:
        from util import simple_usage_message
        simple_usage_message("<tests-dir> <gen-summary-file>")

    tests_dir = sys.argv[1]
    gen_summary_file = sys.argv[2]

    if not os.path.isdir(tests_dir):
        sys.stderr.write(
            "The tests directory not found or not a valid directory: {}.\n".
            format(tests_dir))
        exit(4)
    check_file_exists(gen_summary_file,
                      "The tests are not correctly generated.\n")

    with open(gen_summary_file) as gsf:
        test_name_list = [
            line.split()[0] for line in map(str.strip, gsf.readlines())
            if line and not line.startswith("#")
        ]

    if SPECIFIC_TESTS == "true":
        check_test_pattern_exists_in_list(test_name_list,
                                          SPECIFIED_TESTS_PATTERN)
        test_name_list = filter(
            lambda test_name: test_name_matches_pattern(
                test_name, SPECIFIED_TESTS_PATTERN), test_name_list)

    missing_tests = []
Example #18
def trie_to_sam(index, kmers_trie, keysfile, samfile, args, offdist, maxoffcount,
                process, n, parts):
    """Produce SAM file with guideRNAs and info about their off-targets.

    Convention: Off-target info is stored in the optional field with flag 'of'
    of type 'H' (hex byte array) produced by function
    util.offtargetinfo_to_hex() and can be restored using function
    util.hex_to_offtargetinfo(). Also store there optional fields 'od'
    and 'oc' of type 'i' (integer) indicating parameters offdist
    and maxoffcount, respectively, for which this off-target info was
    produced. This is needed if potentially different values of offdist
    and maxoffcount are used within the same SAM (i.e., for some
    guideRNAs one wants more detailed info about their off-targets
    than for others). 'od' and 'oc' are relevant even if 'of' is empty.
    If 'od' and 'oc' are not provided, refer to info from SAM header.


    Note: SAM uses 1-based genomic coordinates, and the SAM file produced
    by this function is consistent with that; 0-based coordinates of
    guideRNAs stored in the trie are transformed into 1-based coordinates.
    However, coordinates of off-targets stored in 'of' field remain
    0-based.

    Args:
    index: index of the k-mer partition (trie) that keysfile belongs to
    kmers_trie: trie.trie object with all guideRNAs as produced by
                guides.analyze_guides()
    keysfile: name of file with all k-mers that are considered good
              candidate guideRNAs, one per line; if ends with '.gz'
              assume file is gzipped
    samfile: where to store SAM file, will be gzipped
    args: arguments of the project, used to print some info in SAM header
    offdist: maximum Hamming distance to consider from guideRNA to its
             off-target; use -1 for omitting any off-target info
             in resulting BAM (works much faster);
             use this value instead of what args contains
    maxoffcount: store at most this many off-targets for a guideRNA;
                 ignore if offdist is -1;
                 use this value instead of what args contains
    process: process number; to distinguish in output from different processes
    n: length of the k-mer prefix used to partition k-mers into tries
    parts: number of parts (tries) the k-mers are partitioned into
    """
    util.print_log('process%s:trie_to_sam' % process)
    
    util.check_file_exists(keysfile)
    f = gzip.open(keysfile) if keysfile.endswith('.gz') else open(keysfile)
    s = gzip.open(samfile, 'w')
    s.write('@HD\tVN:1.0\tSO:unknown\n')
    genome = args['genome']
    # be careful with changing the next line
    # current rationale is: arrays contain only global genomic coordinates
    # (and label as 0-th element); delim avoids all of these
    delim = util.get_nonexist_int_coord(genome)
    for chrom, length in genome:
        s.write('@SQ\tSN:%s\tLN:%s\n' % (chrom, length))
    s.write('@CO\tprepared with iGuide software\n')
    s.write('@CO\tcontains info about guideRNAs and their off-targets\n')
    s.write('@CO\targuments of the run:\n')
    s.write('@CO\t%s\n' % args)
    count = 0
    starttime = datetime.now()
    lasttime = starttime

    index_seq = guides.generate_four(index, n)
    mismatches = []
    
    if offdist != -1:
        for i in range(parts):
            index_seq1 = guides.generate_four(i, n)
            mismatch = guides.four_compare(index_seq, index_seq1, n)
            mismatches.append(mismatch)
    # util.print_log('process%s:mismatch done...' % process)
    for line in f:
        
        guide = line.split()[0]
        kmer2 = guide[n:]
        
        count += 1
        
        # QNAME
        samline = guide
#        samline = '%st%s' % (count, process)
        if not kmers_trie[index].has_key(kmer2):
            print 'process %s warning: %s is not in trie %s, skip' \
                  % (process, kmer2, index)
            continue
        arr = kmers_trie[index][kmer2]
        if arr[0] != 0:
            print 'process %s warning: %s is not a good guideRNA according' \
                  ' to label in trie %s, skip' % (process, kmer2, index)
            continue
        if len(arr) > 2:
            print 'process %s warning: %s is stored with more than one' \
                  ' coordinate in trie %s, skip' % (process, kmer2, index)
            continue
        coord = arr[1]
        coord = util.map_int_to_coord(coord, genome)
        chrom, pos, strand = coord.split(':')
        pos = int(pos)
        flag = '0' if strand == '+' else '16'
        # FLAG
        samline += '\t%s' % flag
        # RNAME
        samline += '\t%s' % chrom
        if strand == '-':
            pos = pos - args['length'] - len(args['pam']) + 1
        pos += 1  # SAM uses 1-based coordinates, in our code we used 0-based
        # POS
        samline += '\t%s' % pos
        # MAPQ
        samline += '\t100'  # 100 is arbitrary choice, 255 is not recommended
        # CIGAR
        samline += '\t%sM' % (args['length'] + len(args['pam']))
        # RNEXT
        samline += '\t*'
        # PNEXT
        samline += '\t0'
        # TLEN
        samline += '\t0'
        # SEQ
        seq = guide
        if strand == '-':
            seq = str(Seq.Seq(guide).reverse_complement())
        samline += '\t%s' % seq
        # QUAL
        samline += '\t*'
        # offtargets
        offtargetargs = 'od:i:%s\toc:i:%s' % (offdist, maxoffcount)
        samline += '\t%s' % offtargetargs
        if offdist != -1:

            offtargetinfo = get_offtarget_info(mismatches, kmers_trie, kmer2, offdist,
                                               maxoffcount, delim, parts)
            if offtargetinfo:
                offtargetstr = 'of:H:%s' % offtargetinfo
                samline += '\t%s' % offtargetstr
        samline += '\n'
        s.write(samline)
        currenttime = datetime.now()
        # print every minute during first 10 minutes, then every 20 minutes
        if (currenttime - lasttime).seconds > 1200 \
           or ((currenttime - lasttime).seconds > 60
               and (lasttime - starttime).seconds < 600):
            util.print_log('process %s: %s guides processed' % (process, count))
            lasttime = currenttime
    util.print_log('process %s: total %s guides processed' % (process, count))
    s.flush()
    s.close()
    f.close()
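For illustration, the tab-separated line assembled above for a hypothetical guide of length 20 with PAM 'AGG' at 0-based position 999 on chr1, '+' strand, with offdist=3 and maxoffcount=1000, would be built like this:

fields = [
    'ACGTACGTACGTACGTACGTAGG',   # QNAME: the guide sequence itself
    '0',                         # FLAG: 0 for '+', 16 for '-'
    'chr1',                      # RNAME
    '1000',                      # POS: 0-based 999 shifted to 1-based
    '100',                       # MAPQ (arbitrary; 255 is discouraged)
    '23M',                       # CIGAR: guide length + PAM length
    '*', '0', '0',               # RNEXT, PNEXT, TLEN (unused here)
    'ACGTACGTACGTACGTACGTAGG',   # SEQ (reverse complement on '-')
    '*',                         # QUAL
    'od:i:3\toc:i:1000',         # offdist / maxoffcount optional fields
]
samline = '\t'.join(fields) + '\n'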
Example #19
    if not util.create_folder(folder):
        logger.error("%s can not be created." % folder)
        sys.exit(-1)

if not util.check_folder(os.path.join(util.get_script_path(), "data"), logger, False, False):
    cleanup(-1, logger)

if not util.check_folder(os.path.join(util.get_script_path(), "data", args.ide), logger, False, False):
    cleanup(-1, logger)

if not util.check_folder(os.path.join(util.get_script_path(), "data", args.ide, "debian"), logger, False, False):
    cleanup(-1, logger)

# Checking files
for file in ["control.in", "postinst", "sysctl-99.conf"]:
    if not util.check_file_exists(os.path.join(util.get_script_path(), "data", args.ide, "debian", file)) and not \
            util.check_file_readable(os.path.join(util.get_script_path(), "data", args.ide, "debian", file)):
        logger.error("%s does not exist or is not readable." % file)
        cleanup(-1, logger)

for file in ["LICENSE", "Makefile", "pkginfo.in", "prototype.in", "icon.desktop", "start.sh", "vmoptions.README"]:
    if not util.check_file_exists(os.path.join(util.get_script_path(), "data", args.ide, file)) and not \
            util.check_file_readable(os.path.join(util.get_script_path(), "data", args.ide, file)):
        logger.error("%s does not exist or is not readable." % file)
        cleanup(-1, logger)

# Download URL
if util.check_file_exists(os.path.join(util.get_script_path(), "tmp", link.split("/")[-1])):
    if not util.delete_file(os.path.join(util.get_script_path(), "tmp", link.split("/")[-1]), logger, False):
        cleanup(-1, logger)
Example #20
def build_kmers_trie(filename, genome, name, altpam=[], pampos='end', maxcount=10,
                     goodkeysfile='', badkeysfile='', tempdir='', triekeys_v1_filenames=[],
                      kmers_filenames=[], processes=1, n=4, parts=256):
    """Read k-mers and their coordinates from file and store them in a trie.

    The resulting trie is of the form {<k-mer> : <np.array of int values>}.
    In each array, store int-transformed coordinates of occurrences of
    the k-mer, starting from position 1 of the array. Store at most
    'maxcount' coordinates.
    Position 0 in the array is reserved for labeling:
    0: good guideRNA,
    positive: shows how many occurrences this k-mer has in the genome
    [other labels: decide later]
    In this building stage, label all k-mers with alternative PAM
    and all k-mers with more than one occurrence in the genome
    as bad guideRNAs.
    Optionally, also store in a separate file all k-mers that are still
    considered good candidate guideRNAs. This means only filtering out
    k-mers with alternative PAM, because detecting multi-mapping k-mers and
    labeling them as bad guideRNAs may happen after they were first read.

    Note: make sure lines with k-mers in input file are randomly shuffled.
    This is to ensure that for k-mers with more than 'maxcount' occurrences,
    we store an arbitrary 'maxcount' of them without any bias.

    Args:
    filename: name of file with k-mers, assume file is gzipped and
              lines are randomly shuffled
    genome: list of pairs [(<chromosome name>, <chromosome length>)]
    altpam: list of alternative PAM sequences, all k-mers starting
            or ending (depending on argument 'pampos') with these
            sequences are labeled as bad guideRNAs
    pampos: position of alternative PAM in k-mer ('start' or 'end')
    maxcount: store at most this many coordinates for each k-mer
    goodkeysfile: where to store potentially good candidate guideRNAs;
                  use only if altpam is not empty, otherwise all input
                  keys from filename will be stored which is redundant

    Output:
    return trie.trie object {<k-mer> : <np.array of int values>}
    optionally produce file goodkeysfile with candidate guideRNAs
    """
    # parts = 256
    util.check_file_exists(filename)   

    badkeysfiles = ['%s/badkeys%s.txt.gz' % (tempdir, i) for i in range(parts)]

    kmers_trie_files = ['%s/kmers_trie%s.dat' % (tempdir, i) for i in range(parts)]

    util.print_log('classify k-mers into %s...' % parts)
    if parts > 1000:
        tempfiles = [tempfile.NamedTemporaryFile(dir=tempdir,
                                               suffix='.temp%s' % i)
                        for i in range(2)]
        # tempfiles = [gzip.open('%s/temp%s.txt.gz' % (tempdir, i),'w') for i in range(2)]
        mid = parts // 2
        file = gzip.open(filename)
        for line in file:
            kmer = line.split()[0]
            index = get_num(kmer, n)
            if index < mid:
                tempfiles[0].write(str(index) + ' ' + line)
            else:
                tempfiles[1].write(str(index) + ' ' + line)
        for f in tempfiles:
            f.flush()
        file.close()
        util.print_log('write...')

        kmersfiles1 = [gzip.open(kmers_filenames[i],'w') for i in range(mid)]
        # tempfiles = [gzip.open('%s/temp%s.txt.gz' % (tempdir, i)) for i in range(2)]
        temp = [open(tempfiles[i].name) for i in range(2)]
        for line in temp[0]:
            data = line.split()
            index = int(data[0])
            kmer = data[1]
            coord = data[2]
            kmersfiles1[index].write(kmer + ' ' + coord + '\n')
        for f in kmersfiles1:
            f.close()
        temp[0].close()
        # util.print_log('write count1...')

        kmersfiles2 = [gzip.open(kmers_filenames[i],'w') for i in range(mid, parts)]
        for line in temp[1]:
            data = line.split()
            index = int(data[0]) - mid
            kmer = data[1]
            coord = data[2]
            kmersfiles2[index].write(kmer + ' ' + coord + '\n')
        for f in kmersfiles2:
            f.close()
        temp[1].close()
        for f in tempfiles:
            f.close()       
        # util.print_log('write count2...')
    else:
        kmersfiles = [gzip.open(kmers_filenames[i],'w') for i in range(parts)]
        file = gzip.open(filename)
        
        for line in file:
            kmer = line.split()[0]
            index = get_num(kmer, n)
            kmersfiles[index].write(line) 
        file.close()
        for f in kmersfiles:
            f.close()

    util.print_log('done...')
    util.print_log('build tries start...')
    
    process_list = []
    all_task = Queue()
    for i in range(parts):
        task = (kmers_filenames[i], triekeys_v1_filenames[i], badkeysfiles[i], kmers_trie_files[i])
        all_task.put(task)

    for process in range(processes):
        p = Process(target=process_pool_build_tries, args=(all_task, genome, altpam, pampos, maxcount, n))
        p.start()
        process_list.append(p)

    for p in process_list:
        p.join()

    util.print_log('build tries done...')
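A minimal sketch of the per-k-mer value layout described in the docstring (position 0 is the label, positions 1 and up are int-encoded coordinates), mirroring the update applied when a k-mer is seen again:

import numpy as np

value = np.array([0, 1234567])      # first occurrence: label 0 (good guideRNA), one coordinate
value = np.append(value, 2345678)   # a second occurrence elsewhere in the genome
value[0] = len(value) - 1           # re-label with the occurrence count: now a multi-mapping k-mer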