Ejemplo n.º 1
0
    def test_getCoordKey(self):
        """
        Checks getCoordKey output for the illumina, sra and 454 header fixtures
        """
        for coord_type in ('illumina', 'sra', '454'):
            # Fixtures live on the test case as desc_<type> (input headers)
            # and id_<type> (expected coordinate keys)
            headers = getattr(self, 'desc_%s' % coord_type)
            keys = [getCoordKey(h, coord_type=coord_type) for h in headers]
            self.assertListEqual(keys, getattr(self, 'id_%s' % coord_type))
            print(keys)
 def _read_pairs(seq_file_1, seq_file_2):
     """
     Walks two sequence files in lockstep and yields coordinate-matched pairs

     Arguments:
     seq_file_1 = the first sequence file, passed to readSeqFile
     seq_file_2 = the second sequence file, passed to readSeqFile

     Yields:
     a tuple of (coordinate key, [record from file 1, record from file 2])

     Raises:
     Exception if the two records at the same position have different keys
     """
     # coord_type and delimiter are resolved from the enclosing scope
     def _coord(record):
         return getCoordKey(record.description, coord_type=coord_type,
                            delimiter=delimiter)

     records_1 = readSeqFile(seq_file_1, index=False)
     records_2 = readSeqFile(seq_file_2, index=False)
     for seq_1, seq_2 in zip(records_1, records_2):
         key_1 = _coord(seq_1)
         key_2 = _coord(seq_2)
         if key_1 == key_2:
             yield (key_1, [seq_1, seq_2])
             continue
         # Records at the same position must describe the same coordinate
         message = 'Coordinates for sequences %s and %s do not match'
         raise Exception(message % (key_1, key_2))
Ejemplo n.º 3
0
 def _read_pairs(seq_file_1, seq_file_2):
     """
     Reads two sequence files in parallel and yields pairs with equal coordinates

     Arguments:
     seq_file_1 = the first sequence file, passed to readSeqFile
     seq_file_2 = the second sequence file, passed to readSeqFile

     Yields:
     a tuple of (coordinate key, [record from file 1, record from file 2])

     Raises:
     Exception if the coordinate keys of a positional pair differ
     """
     # coord_type and delimiter come from the enclosing scope
     paired_records = zip(readSeqFile(seq_file_1, index=False),
                          readSeqFile(seq_file_2, index=False))
     for pair in paired_records:
         # Keys are computed in file order: file 1 record first, then file 2
         key_1, key_2 = [getCoordKey(rec.description,
                                     coord_type=coord_type,
                                     delimiter=delimiter)
                         for rec in pair]
         if key_1 == key_2:
             yield (key_1, list(pair))
         else:
             raise Exception('Coordinates for sequences %s and %s do not match'
                             % (key_1, key_2))
Ejemplo n.º 4
0
 def _read_pairs(seq_file_1, seq_file_2):
     """
     Iterates over two sequence files in parallel and yields paired records

     Arguments:
     seq_file_1 = the first sequence file, passed to readSeqFile
     seq_file_2 = the second sequence file, passed to readSeqFile

     Yields:
     a tuple of (coordinate key of the file 1 record, [seq_1, seq_2])

     Raises:
     Exception if the coordinate keys differ and coord_type is not 'r1'
     """
     # coord_type and delimiter are resolved from the enclosing scope
     iter_1 = readSeqFile(seq_file_1, index=False)
     iter_2 = readSeqFile(seq_file_2, index=False)
     for seq_1, seq_2 in zip(iter_1, iter_2):
         key_1 = getCoordKey(seq_1.description, coord_type=coord_type,
                             delimiter=delimiter)
         key_2 = getCoordKey(seq_2.description, coord_type=coord_type,
                             delimiter=delimiter)
         # With coord_type 'r1' the pair is accepted even when the keys differ
         if key_1 == key_2 or coord_type == 'r1':
             yield (key_1, [seq_1, seq_2])
         else:
             # Fail straight away: no debugger breakpoint here, so that
             # non-interactive runs raise instead of blocking on a pdb prompt
             raise Exception('Coordinates for sequences %s and %s do not match'
                             % (key_1, key_2))
Ejemplo n.º 5
0
 def _key_func(x):
     """
     Converts x into a coordinate key using the enclosing coord_type and out_args
     """
     key_args = {'coord_type': coord_type,
                 'delimiter': out_args['delimiter']}
     return getCoordKey(x, **key_args)
Ejemplo n.º 6
0
def pairSeq(seq_file_1, seq_file_2, fields_1=None, fields_2=None,
            coord_type=default_coord_type,
            out_args=default_out_args):
    """
    Pairs the records of two sequence files by coordinate and copies annotations between them

    Arguments:
    seq_file_1 = the file containing the grouped sequences and annotations
    seq_file_2 = the file to assign annotations to from seq_file_1
    fields_1 = list of annotations in seq_file_1 records to copy to seq_file_2 records;
               if None do not copy any annotations
    fields_2 = list of annotations in seq_file_2 records to copy to seq_file_1 records;
               if None do not copy any annotations
    coord_type = the sequence header format
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    a list of tuples holding successfully paired filenames for (seq_file_1, seq_file_2)
    """
    # Define private functions
    def _key_func(x):
        return getCoordKey(x, coord_type=coord_type, delimiter=out_args['delimiter'])

    log = OrderedDict()
    log['START'] = 'PairSeq'
    log['FILE1'] = os.path.basename(seq_file_1)
    log['FILE2'] = os.path.basename(seq_file_2)
    log['FIELDS_1'] = ','.join(fields_1) if fields_1 is not None else None
    log['FIELDS_2'] = ','.join(fields_2) if fields_2 is not None else None
    log['COORD_TYPE'] = coord_type
    printLog(log)

    # Define output type
    if out_args['out_type'] is None:
        out_type_1 = getFileType(seq_file_1)
        out_type_2 = getFileType(seq_file_2)
    else:
        out_type_1 = out_type_2 = out_args['out_type']

    # Define output name
    if out_args['out_name'] is None:
        out_name_1 = out_name_2 = None
    else:
        out_name_1 = '%s-1' % out_args['out_name']
        out_name_2 = '%s-2' % out_args['out_name']

    # Open and count files
    start_time = time()
    printMessage("Indexing files", start_time=start_time)
    # Index file 1 by coordinate key
    seq_count_1 = countSeqFile(seq_file_1)
    seq_dict_1 = readSeqFile(seq_file_1, index=True, key_func=_key_func)
    # Define file 2 iterator
    seq_count_2 = countSeqFile(seq_file_2)
    seq_iter_2 = readSeqFile(seq_file_2, index=False)
    printMessage("Done", start_time=start_time, end=True)

    # Open output file handles
    pass_handle_1 = getOutputHandle(seq_file_1, 'pair-pass', out_args['out_dir'],
                                    out_name=out_name_1, out_type=out_type_1)
    pass_handle_2 = getOutputHandle(seq_file_2, 'pair-pass', out_args['out_dir'],
                                    out_name=out_name_2, out_type=out_type_2)

    # Failed-record handles and the paired key set only exist when requested
    if out_args['failed']:
        fail_handle_1 = getOutputHandle(seq_file_1, 'pair-fail', out_dir=out_args['out_dir'],
                                        out_name=out_name_1, out_type=out_type_1)
        fail_handle_2 = getOutputHandle(seq_file_2, 'pair-fail', out_dir=out_args['out_dir'],
                                        out_name=out_name_2, out_type=out_type_2)
        pass_keys = set()

    # Iterate over pairs and write to output files
    start_time = time()
    rec_count = pair_count = 0
    for seq_2 in seq_iter_2:
        # Print progress for previous iteration
        printProgress(rec_count, seq_count_2, 0.05, start_time)
        rec_count += 1

        # Check for file 2 mate pair in file 1
        coord_2 = getCoordKey(seq_2.id, coord_type=coord_type,
                              delimiter=out_args['delimiter'])
        seq_1 = seq_dict_1.get(coord_2, None)

        if seq_1 is not None:
            # Record paired keys
            pair_count += 1

            if fields_1 is not None or fields_2 is not None:
                # Both headers are parsed before either is modified, so each
                # copy below works from the original annotations
                ann_1 = parseAnnotation(seq_1.description,
                                        delimiter=out_args['delimiter'])
                ann_2 = parseAnnotation(seq_2.description,
                                        delimiter=out_args['delimiter'])

                # Prepend annotations from seq_1 to seq_2
                if fields_1 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_1.items()
                                            if k in fields_1])
                    merge_ann = mergeAnnotation(ann_2, copy_ann, prepend=True,
                                                delimiter=out_args['delimiter'])
                    seq_2.id = flattenAnnotation(merge_ann,
                                                 delimiter=out_args['delimiter'])
                    seq_2.description = ''

                # Append annotations from seq_2 to seq_1
                if fields_2 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_2.items()
                                            if k in fields_2])
                    merge_ann = mergeAnnotation(ann_1, copy_ann, prepend=False,
                                                delimiter=out_args['delimiter'])
                    seq_1.id = flattenAnnotation(merge_ann,
                                                 delimiter=out_args['delimiter'])
                    seq_1.description = ''

            # Write paired records
            SeqIO.write(seq_1, pass_handle_1, out_type_1)
            SeqIO.write(seq_2, pass_handle_2, out_type_2)

            # Remember the paired key for finding unpaired file 1 records
            if out_args['failed']:
                pass_keys.add(coord_2)
        elif out_args['failed']:
            # Write unpaired file 2 records
            SeqIO.write(seq_2, fail_handle_2, out_type_2)

    # Print final progress
    printProgress(rec_count, seq_count_2, 0.05, start_time)

    # Find and write unpaired file 1 records
    if out_args['failed']:
        start_time = time()
        printMessage("Finding unpaired", start_time=start_time)

        # Find file 1 unpaired keys
        unpaired = set(seq_dict_1).difference(pass_keys)
        # Write unpaired file 1 records
        for k in unpaired:
            SeqIO.write(seq_dict_1[k], fail_handle_1, out_type_1)

        printMessage("Done", start_time=start_time, end=True)

    # Print log
    log = OrderedDict()
    log['OUTPUT1'] = os.path.basename(pass_handle_1.name)
    log['OUTPUT2'] = os.path.basename(pass_handle_2.name)
    log['SEQUENCES1'] = seq_count_1
    log['SEQUENCES2'] = seq_count_2
    log['PASS'] = pair_count
    log['END'] = 'PairSeq'
    printLog(log)

    # Close file handles, including the failed-record handles when opened
    pass_handle_1.close()
    pass_handle_2.close()
    if out_args['failed']:
        fail_handle_1.close()
        fail_handle_2.close()

    return [(pass_handle_1.name, pass_handle_2.name)]
Ejemplo n.º 7
0
 def _key_func(x):
     """
     Returns the coordinate key of x for the enclosing coord_type and delimiter
     """
     coord_key = getCoordKey(x, coord_type=coord_type, delimiter=out_args['delimiter'])
     return coord_key
Ejemplo n.º 8
0
 def _key_func(x):
     """
     Returns the coordinate key of x using coord_type and delimiter from the enclosing scope
     """
     coord_key = getCoordKey(x,
                             coord_type=coord_type,
                             delimiter=delimiter)
     return coord_key
 def _key_func(x):
     """
     Builds the coordinate key for x; coord_type and delimiter come from the enclosing scope
     """
     key_args = {'coord_type': coord_type, 'delimiter': delimiter}
     return getCoordKey(x, **key_args)
Ejemplo n.º 10
0
def pairSeq(seq_file_1,
            seq_file_2,
            fields_1=None,
            fields_2=None,
            action=None,
            coord_type=default_coord,
            out_args=default_out_args):
    """
    Synchronizes paired end files and copies annotations between them

    Arguments:
      seq_file_1 : the file containing the grouped sequences and annotations.
      seq_file_2 : the file to assign annotations to from seq_file_1.
      fields_1 : list of annotations in seq_file_1 records to copy to seq_file_2 records;
                 if None do not copy any annotations.
      fields_2 : list of annotations in seq_file_2 records to copy to seq_file_1 records;
                 if None do not copy any annotations.
      action : the collapse action to take on all copied annotation if they already exist in the
               target header.
      coord_type : the sequence header format.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      list: a list of tuples holding successfully paired filenames for (seq_file_1, seq_file_2).
    """

    # Define private functions
    def _key_func(x):
        return getCoordKey(x,
                           coord_type=coord_type,
                           delimiter=out_args['delimiter'])

    log = OrderedDict()
    log['START'] = 'PairSeq'
    log['FILE1'] = os.path.basename(seq_file_1)
    log['FILE2'] = os.path.basename(seq_file_2)
    log['FIELDS_1'] = ','.join(fields_1) if fields_1 is not None else None
    log['FIELDS_2'] = ','.join(fields_2) if fields_2 is not None else None
    log['COORD_TYPE'] = coord_type
    printLog(log)

    # Define output type
    if out_args['out_type'] is None:
        out_type_1 = getFileType(seq_file_1)
        out_type_2 = getFileType(seq_file_2)
    else:
        out_type_1 = out_type_2 = out_args['out_type']

    # Define output name
    if out_args['out_name'] is None:
        out_name_1 = out_name_2 = None
    else:
        out_name_1 = '%s-1' % out_args['out_name']
        out_name_2 = '%s-2' % out_args['out_name']

    # Open and count files
    start_time = time()
    printMessage("Indexing files", start_time=start_time)
    # Index file 1 by coordinate key
    seq_count_1 = countSeqFile(seq_file_1)
    seq_dict_1 = readSeqFile(seq_file_1, index=True, key_func=_key_func)
    # Define file 2 iterator
    seq_count_2 = countSeqFile(seq_file_2)
    seq_iter_2 = readSeqFile(seq_file_2, index=False)
    printMessage("Done", start_time=start_time, end=True)

    # Open output file handles
    pass_handle_1 = getOutputHandle(seq_file_1,
                                    'pair-pass',
                                    out_args['out_dir'],
                                    out_name=out_name_1,
                                    out_type=out_type_1)
    pass_handle_2 = getOutputHandle(seq_file_2,
                                    'pair-pass',
                                    out_args['out_dir'],
                                    out_name=out_name_2,
                                    out_type=out_type_2)

    # Failed-record handles and the paired key set only exist when requested
    if out_args['failed']:
        fail_handle_1 = getOutputHandle(seq_file_1,
                                        'pair-fail',
                                        out_dir=out_args['out_dir'],
                                        out_name=out_name_1,
                                        out_type=out_type_1)
        fail_handle_2 = getOutputHandle(seq_file_2,
                                        'pair-fail',
                                        out_dir=out_args['out_dir'],
                                        out_name=out_name_2,
                                        out_type=out_type_2)
        pass_keys = set()

    # Iterate over pairs and write to output files
    start_time = time()
    rec_count = pair_count = 0
    for seq_2 in seq_iter_2:
        # Print progress for previous iteration
        printProgress(rec_count, seq_count_2, 0.05, start_time=start_time)
        rec_count += 1

        # Check for file 2 mate pair in file 1
        coord_2 = getCoordKey(seq_2.id,
                              coord_type=coord_type,
                              delimiter=out_args['delimiter'])
        seq_1 = seq_dict_1.get(coord_2, None)

        if seq_1 is not None:
            # Record paired keys
            pair_count += 1

            if fields_1 is not None or fields_2 is not None:
                # Both headers are parsed before either is modified, so each
                # copy below works from the original annotations
                ann_1 = parseAnnotation(seq_1.description,
                                        delimiter=out_args['delimiter'])
                ann_2 = parseAnnotation(seq_2.description,
                                        delimiter=out_args['delimiter'])

                # Prepend annotations from seq_1 to seq_2
                if fields_1 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_1.items()
                                            if k in fields_1])
                    merge_ann = mergeAnnotation(
                        ann_2,
                        copy_ann,
                        prepend=True,
                        delimiter=out_args['delimiter'])
                    # Collapse if necessary
                    if action is not None:
                        merge_ann = collapseAnnotation(
                            merge_ann,
                            action,
                            fields=fields_1,
                            delimiter=out_args['delimiter'])
                    # Flatten
                    seq_2.id = flattenAnnotation(
                        merge_ann, delimiter=out_args['delimiter'])
                    seq_2.description = ''

                # Append annotations from seq_2 to seq_1
                if fields_2 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_2.items()
                                            if k in fields_2])
                    merge_ann = mergeAnnotation(
                        ann_1,
                        copy_ann,
                        prepend=False,
                        delimiter=out_args['delimiter'])
                    # Collapse if necessary
                    if action is not None:
                        merge_ann = collapseAnnotation(
                            merge_ann,
                            action,
                            fields=fields_2,
                            delimiter=out_args['delimiter'])
                    # Flatten
                    seq_1.id = flattenAnnotation(
                        merge_ann, delimiter=out_args['delimiter'])
                    seq_1.description = ''

            # Write paired records
            SeqIO.write(seq_1, pass_handle_1, out_type_1)
            SeqIO.write(seq_2, pass_handle_2, out_type_2)

            # Remember the paired key for finding unpaired file 1 records
            if out_args['failed']:
                pass_keys.add(coord_2)
        elif out_args['failed']:
            # Write unpaired file 2 records
            SeqIO.write(seq_2, fail_handle_2, out_type_2)

    # Print final progress
    printProgress(rec_count, seq_count_2, 0.05, start_time=start_time)

    # Find and write unpaired file 1 records
    if out_args['failed']:
        start_time = time()
        printMessage("Finding unpaired", start_time=start_time)

        # Find file 1 unpaired keys
        unpaired = set(seq_dict_1).difference(pass_keys)
        # Write unpaired file 1 records
        for k in unpaired:
            SeqIO.write(seq_dict_1[k], fail_handle_1, out_type_1)

        printMessage("Done", start_time=start_time, end=True)

    # Print log
    log = OrderedDict()
    log['OUTPUT1'] = os.path.basename(pass_handle_1.name)
    log['OUTPUT2'] = os.path.basename(pass_handle_2.name)
    log['SEQUENCES1'] = seq_count_1
    log['SEQUENCES2'] = seq_count_2
    log['PASS'] = pair_count
    log['END'] = 'PairSeq'
    printLog(log)

    # Close file handles, including the failed-record handles when opened
    pass_handle_1.close()
    pass_handle_2.close()
    if out_args['failed']:
        fail_handle_1.close()
        fail_handle_2.close()

    return [(pass_handle_1.name, pass_handle_2.name)]