def parser(self, log_file):
     """
     @summary : Count the records of the command output (self.output_seq) and
                report this count in log_file.
     @log_file : [str] Path to the sample process log file.
     """
     # Count the sequences produced by the command
     reader = SequenceFileReader.factory(self.output_seq)
     nb_cleaned = sum(1 for _ in reader)
     reader.close()
     # Report the count through the Logger
     report_lines = (
         'Results:\n',
         "\tnb seq with 3' primer : " + str(nb_cleaned) + '\n',
     )
     log = Logger(log_file)
     for line in report_lines:
         log.write(line)
     log.close()
 def parser(self, log_file):
     """
     @summary : Read the command output (self.output_seq), count its records
                and add a 'Results' section with this count in log_file.
     @log_file : [str] Path to the sample process log file.
     """
     # Every record of the output file counts as one sequence
     seq_reader = SequenceFileReader.factory(self.output_seq)
     nb_cleaned = sum(1 for _record in seq_reader)
     seq_reader.close()
     # Build the summary line before opening the log
     summary_line = "\tnb seq with 3' primer : " + str(nb_cleaned) + '\n'
     log_writer = Logger(log_file)
     log_writer.write('Results:\n')
     log_writer.write(summary_line)
     log_writer.close()
def get_seq_length(input_file, size_separator=None):
    """
    @summary: Returns the number of sequences by sequences lengths.
    @param input_file: [str] The sequence file path.
    @param size_separator: [str] If it exists the size separator in sequence ID.
                           When provided, each record counts for the integer
                           found after the last separator in its ID; otherwise
                           each record counts for 1.
    @return: [dict] By sequences lengths (as str) the number of sequence.
    @raise ValueError: If size_separator is provided and the text after the
                       last separator in a record ID is not an integer.
    """
    nb_by_length = dict()
    FH_seq = SequenceFileReader.factory(input_file)
    try:
        for record in FH_seq:
            nb_seq = 1
            if size_separator is not None:
                nb_seq = int(record.id.rsplit(size_separator, 1)[-1])
            # Keys are kept as strings, as in the historical return format.
            length_key = str(len(record.string))
            # dict.get replaces dict.has_key, which no longer exists in Python 3.
            nb_by_length[length_key] = nb_by_length.get(length_key, 0) + nb_seq
    finally:
        # Release the file handle even if a record cannot be processed.
        FH_seq.close()
    return nb_by_length
def get_seq_length( input_file, size_separator=None ):
    """
    @summary: Returns the number of sequences by sequences lengths.
    @param input_file: [str] The sequence file path.
    @param size_separator: [str] If it exists the size separator in sequence ID.
                           When provided, each record counts for the integer
                           found after the last separator in its ID; otherwise
                           each record counts for 1.
    @return: [dict] By sequences lengths (as str) the number of sequence.
    @raise ValueError: If size_separator is provided and the text after the
                       last separator in a record ID is not an integer.
    """
    nb_by_length = dict()
    FH_seq = SequenceFileReader.factory( input_file )
    try:
        for record in FH_seq:
            nb_seq = 1
            if size_separator is not None:
                nb_seq = int(record.id.rsplit(size_separator, 1)[-1])
            # Keys are kept as strings, as in the historical return format.
            length_key = str(len(record.string))
            # dict.get replaces dict.has_key, which no longer exists in Python 3.
            nb_by_length[length_key] = nb_by_length.get(length_key, 0) + nb_seq
    finally:
        # Release the file handle even if a record cannot be processed.
        FH_seq.close()
    return nb_by_length
Example #5
0
    # Get observation sequences
    observation_id_by_seq = dict()
    FH_seeds = FastaIO(args.seeds_fasta)
    for record in FH_seeds:
        if record.string in observation_id_by_seq:
            raise Exception("The OTU '" +
                            observation_id_by_seq[record.string] + "' and '" +
                            record.id + "' have the same sequence.")
        observation_id_by_seq[record.string] = record.id.split(";size=")[0]
    FH_seeds.close()

    # Get centroids of observation
    reference_by_observation_id = dict()
    for file in args.reads:
        FH_reads = SequenceFileReader.factory(file)
        for record in FH_reads:
            if record.string in observation_id_by_seq:
                observation_id = observation_id_by_seq[record.string]
                reference_id = re.search("reference=([^\s]+)",
                                         record.description).group(1)
                if observation_id not in reference_by_observation_id:
                    reference_by_observation_id[observation_id] = reference_id
                elif len(reference_by_observation_id[observation_id].split(
                        ",")) > len(reference_id.split(",")):
                    reference_by_observation_id[observation_id] = reference_id
        FH_reads.close()
    if len(observation_id_by_seq) != len(reference_by_observation_id):
        missing = list()
        for seed_seq in observation_id_by_seq:
            if observation_id_by_seq[
    # Get observation sequences
    nb_observations = 0
    observation_ids_by_seq = dict()
    FH_seeds = FastaIO(args.input)
    for record in FH_seeds:
        nb_observations += 1
        if record.string not in observation_ids_by_seq:
            observation_ids_by_seq[record.string] = list()
        observation_ids_by_seq[record.string].append(record.id)
    FH_seeds.close()

    # Get centroids (the real centroid and identical sequences) ID by observation
    observation_ids_by_centroid_id = dict()
    for file in args.trimmed_reads:
        FH_reads = SequenceFileReader.factory(file)
        for record in FH_reads:
            record_seq = record.string.replace("-", "").replace(".", "")
            if record_seq in observation_ids_by_seq:
                observation_ids_by_centroid_id[record.id] = observation_ids_by_seq[record_seq]
        FH_reads.close()

    # Get reference by observation
    reference_by_observation_id = dict()
    for file in args.reads:
        FH_reads = SequenceFileReader.factory(file)
        for record in FH_reads:
            if record.id in observation_ids_by_centroid_id:
                observation_ids = observation_ids_by_centroid_id[record.id]
                reference_id = re.search("reference=([^\s]+)", record.description).group(1)
                for current_obs_id in observation_ids: