def snpMatrixGenerator(sourceFile,
                       destFile,
                       recordAll=False,
                       recordRandomSample=True):
    """Collect SNP columns from every data block of a multi-block Nexus file.

    ``sourceFile`` is consumed line by line and cut into chunks at each line
    that contains both "begin" and "data" (case-insensitive). Every chunk is
    parsed with ``NexusReader``; chunks that expose a ``data`` block are handed
    to ``_findDifferences`` together with one shared ``NexusWriter``. The
    accumulated nexus text is finally written to ``destFile``.

    :param sourceFile: open, iterable file-like object yielding text lines
    :param destFile: open, writable file-like object receiving the result
    :param recordAll: forwarded unchanged to ``_findDifferences``
    :param recordRandomSample: forwarded unchanged to ``_findDifferences``

    The two flags must differ; if they are equal, "Invalid Options" is printed
    and the interpreter exits before either file is touched. Otherwise both
    files are closed before returning, including when processing raises.
    """
    if recordAll == recordRandomSample:
        # print(...) with a single argument behaves the same on Python 2 and 3
        print("Invalid Options")
        exit()

    destNexus = NexusWriter()

    def _absorb(chunk, snpCol):
        # Parse one chunk and return the (possibly advanced) column counter.
        reader = NexusReader()
        reader.read_string(chunk)
        if "data" in reader.blocks:
            return _findDifferences(reader, destNexus, snpCol, recordAll,
                                    recordRandomSample)
        return snpCol

    try:
        pending = []  # lines of the chunk currently being collected
        snpCol = 0
        for line in sourceFile:
            lowered = line.lower()
            if "begin" in lowered and "data" in lowered:
                # A new data block starts: flush what was gathered so far
                # (the very first flush sees an empty string, as before).
                snpCol = _absorb("".join(pending), snpCol)
                pending = [line]
            else:
                pending.append(line)

        # flush the trailing chunk
        _absorb("".join(pending), snpCol)

        destFile.write(destNexus.make_nexus() + '\n')
    finally:
        try:
            destFile.close()
        finally:
            sourceFile.close()
def snpMatrixGenerator(sourceFile, destFile, recordAll=False,
                       recordRandomSample=True):
    """Collect SNP columns from every data block of a multi-block Nexus file.

    ``sourceFile`` is consumed line by line and cut into chunks at each line
    that contains both "begin" and "data" (case-insensitive). Every chunk is
    parsed with ``NexusReader``; chunks that expose a ``data`` block are handed
    to ``_findDifferences`` together with one shared ``NexusWriter``. The
    accumulated nexus text is finally written to ``destFile``.

    :param sourceFile: open, iterable file-like object yielding text lines
    :param destFile: open, writable file-like object receiving the result
    :param recordAll: forwarded unchanged to ``_findDifferences``
    :param recordRandomSample: forwarded unchanged to ``_findDifferences``

    The two flags must differ; if they are equal, "Invalid Options" is printed
    and the interpreter exits before either file is touched. Otherwise both
    files are closed before returning, including when processing raises.
    """
    if recordAll == recordRandomSample:
        # print(...) with a single argument behaves the same on Python 2 and 3
        print("Invalid Options")
        exit()

    destNexus = NexusWriter()

    def _absorb(chunk, snpCol):
        # Parse one chunk and return the (possibly advanced) column counter.
        reader = NexusReader()
        reader.read_string(chunk)
        if "data" in reader.blocks:
            return _findDifferences(reader, destNexus, snpCol, recordAll,
                                    recordRandomSample)
        return snpCol

    try:
        pending = []  # lines of the chunk currently being collected
        snpCol = 0
        for line in sourceFile:
            lowered = line.lower()
            if "begin" in lowered and "data" in lowered:
                # A new data block starts: flush what was gathered so far
                # (the very first flush sees an empty string, as before).
                snpCol = _absorb("".join(pending), snpCol)
                pending = [line]
            else:
                pending.append(line)

        # flush the trailing chunk
        _absorb("".join(pending), snpCol)

        destFile.write(destNexus.make_nexus() + '\n')
    finally:
        try:
            destFile.close()
        finally:
            sourceFile.close()
Example #3
0
 def test_regression_format_string_has_quoted_symbols(self):
     """Regression: Symbols in the format string should be quoted"""
     # load every (taxon, character, state) triple of the fixture
     writer = NexusWriter()
     for label in data:
         states = data[label]
         for taxon in states:
             writer.add(taxon, label, states[taxon])
     rendered = writer.make_nexus()
     # the symbol list has to appear wrapped in double quotes
     assert 'SYMBOLS="123456"' in rendered
 def test_regression_format_string_has_quoted_symbols(self):
     """Regression: Symbols in the format string should be quoted"""
     writer = NexusWriter()
     # feed the whole fixture into the writer, one state at a time
     for label, states in data.items():
         for taxon in states:
             writer.add(taxon, label, states[taxon])
     # the symbol list has to appear wrapped in double quotes
     assert 'SYMBOLS="123456"' in writer.make_nexus()
Example #5
0
class Test_NexusWriter_2(unittest.TestCase):
    """Checks the text produced by ``NexusWriter.make_nexus``.

    Covers the non-interleaved, character-block and interleaved variants.
    All regular expressions that contain backslash escapes are raw strings so
    that Python does not treat ``\\s``, ``\\w``, ``\\d`` or ``\\=`` as
    (invalid) string escapes; the patterns themselves are unchanged.
    """

    def setUp(self):
        # populate a fresh writer with the module-level `data` fixture
        self.nex = NexusWriter()
        for char, b in data.items():
            for taxon, value in b.items():
                self.nex.add(taxon, char, value)

    def test_nexus_noninterleave(self):
        """Test Nexus Generation - Non-Interleaved"""
        n = self.nex.make_nexus(interleave=False)
        assert re.search("#NEXUS", n)
        assert re.search("BEGIN DATA;", n)
        assert re.search("DIMENSIONS NTAX=3 NCHAR=2;", n)
        assert re.search("MATRIX", n)
        assert re.search(r"Latin\s+36", n)
        assert re.search(r"French\s+14", n)
        assert re.search(r"English\s+25", n)
        assert re.search(r"FORMAT.*MISSING\=(.+?)", n).groups()[0] == '?'
        assert re.search(r"FORMAT.*DATATYPE\=(\w+)\s", n).groups()[0] == 'STANDARD'
        assert re.search(r'FORMAT.*SYMBOLS\="(\d+)";', n).groups()[0] == '123456'

    def test_nexus_charblock(self):
        """Test Nexus Generation - with characters block"""
        n = self.nex.make_nexus(charblock=True)
        assert re.search("#NEXUS", n)
        assert re.search("BEGIN DATA;", n)
        assert re.search("DIMENSIONS NTAX=3 NCHAR=2;", n)
        assert re.search("CHARSTATELABELS", n)
        assert re.search("1 char1,", n)
        assert re.search("2 char2", n)
        assert re.search("MATRIX", n)
        assert re.search(r"Latin\s+36", n)
        assert re.search(r"French\s+14", n)
        assert re.search(r"English\s+25", n)
        assert re.search(r"FORMAT.*MISSING\=(.+?)", n).groups()[0] == '?'
        assert re.search(r"FORMAT.*DATATYPE\=(\w+)\s", n).groups()[0] == 'STANDARD'
        assert re.search(r'FORMAT.*SYMBOLS\="(\d+)";', n).groups()[0] == '123456'

    def test_nexus_interleave(self):
        """Test Nexus Generation - Interleaved"""
        n = self.nex.make_nexus(interleave=True)
        assert re.search("#NEXUS", n)
        assert re.search("BEGIN DATA;", n)
        assert re.search("DIMENSIONS NTAX=3 NCHAR=2;", n)
        assert re.search("MATRIX", n)
        # char1
        assert re.search(r"Latin\s+3", n)
        assert re.search(r"French\s+1", n)
        assert re.search(r"English\s+2", n)
        # char2
        assert re.search(r"Latin\s+6", n)
        assert re.search(r"French\s+4", n)
        assert re.search(r"English\s+5", n)

        assert re.search(r"FORMAT.*MISSING\=(.+?)", n).groups()[0] == '?'
        assert re.search(r"FORMAT.*DATATYPE\=(\w+)\s", n).groups()[0] == 'STANDARD'
        assert re.search("FORMAT.*(INTERLEAVE)", n).groups()[0] == 'INTERLEAVE'
        assert re.search(r'FORMAT.*SYMBOLS\="(\d+)";', n).groups()[0] == '123456'
Example #6
0
 def test_regression_format_string_has_datatype_first(self):
     """Regression: Format string should contain 'datatype' as the first element"""
     # SplitsTree complains otherwise.
     writer = NexusWriter()
     for label in data:
         states = data[label]
         for taxon in states:
             writer.add(taxon, label, states[taxon])
     rendered = writer.make_nexus()
     # DATATYPE must directly follow the FORMAT keyword
     assert "FORMAT DATATYPE=STANDARD" in rendered
Example #7
0
 def test_regression_format_string_has_datatype_first(self):
     """Regression: Format string should contain 'datatype' as the first element"""
     # SplitsTree complains otherwise.
     writer = NexusWriter()
     # feed the whole fixture into the writer, one state at a time
     for label, states in data.items():
         for taxon in states:
             writer.add(taxon, label, states[taxon])
     # DATATYPE must directly follow the FORMAT keyword
     assert "FORMAT DATATYPE=STANDARD" in writer.make_nexus()
Example #8
0
def combine_nexuses(*nexuslist):
    """
    Merges several nexus sources into one NexusWriter.

    The sources may be given as separate positional arguments or as one
    list/tuple. Each source is passed through `get_nexus_reader` first.

    :param nexuslist: The sources to combine

    :return: A NexusWriter instance
    :raises ValueError: if a block type other than `data` or `trees` is found
    """
    sources = nexuslist
    # a lone list/tuple argument is treated as the collection of sources
    if len(sources) == 1 and isinstance(sources[0], (list, tuple)):
        sources = sources[0]
    readers = [get_nexus_reader(source) for source in sources]

    combined = NexusWriter()

    # gather every block type that occurs in any of the inputs
    block_types = set()
    for reader in readers:
        block_types.update(list(reader.blocks))

    for block_type in block_types:
        if block_type == 'data':
            combined = combine_datablocks(combined, readers)
        elif block_type == 'trees':
            combined = combine_treeblocks(combined, readers)
        else:  # pragma: no cover
            raise ValueError(
                "Don't know how to combine %s blocks" % block_type)
    return combined
Example #9
0
def nexus(args):  # pragma: no cover
    # Sub-command entry point: parses `--output` from args.args[1:] and emits
    # the content of a NexusWriter, either to stdout or to the given file.
    usage = """
    Convert a lexibank dataset to nexus.

        lexibank nexus <DATASET> --output=...
    """
    # NOTE(review): the return value of get_dataset() is discarded and the
    # writer below never receives any data in this function - confirm intent.
    get_dataset(args)

    cli = argparse.ArgumentParser(prog='nexus', usage=usage)
    cli.add_argument('--output', default=None, help='Nexus output file')
    options = cli.parse_args(args.args[1:])

    writer = NexusWriter()

    destination = options.output
    if destination:
        writer.write_to_file(filename=destination)
        return
    # no (or empty) --output: dump to stdout
    print(writer.write())
Example #10
0
def binarise(nexus_obj, one_nexus_per_block=False):
    """
    Returns a binary variant of the given `nexus_obj`.
    If `one_nexus_per_block` then we return a list of NexusWriter instances.

    Every character is recoded with `_recode_to_binary`; position `j` of the
    recoded states becomes a new character labelled ``<label>_<j>``.

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :param one_nexus_per_block: Whether to return a single NexusWriter, or a
                                list of NexusWriter's (one per character)
    :type one_nexus_per_block: Boolean

    :return: A NexusWriter instance or a list of NexusWriter instances.
    :raises AssertionError: if nexus_obj is not a nexus
    :raises NexusFormatException: if nexus_obj does not have a `data` block
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])
    nexuslist = []
    n = NexusWriter()

    for i in sorted(nexus_obj.data.charlabels):
        label = nexus_obj.data.charlabels[i]  # character label
        char = nexus_obj.data.characters[label]  # character dict (taxon->state)
        recoding = _recode_to_binary(char)  # recode

        # Width of the recoding, measured on its first entry. The previous
        # `recoding.keys()[0]` only works on Python 2, where keys() is a
        # list; iterating the values works on both major versions. An empty
        # recoding yields width 0, i.e. nothing is added for this character.
        new_char_length = len(next(iter(recoding.values()), ()))

        # loop over recoded data
        for j in range(new_char_length):
            # the label depends on j only, so build it once per position
            new_label = "%s_%d" % (str(label), j)
            for taxon, state in recoding.items():
                n.add(taxon, new_label, state[j])

        if one_nexus_per_block:
            # hand out the finished writer and start a new one
            nexuslist.append(n)
            n = NexusWriter()

    if one_nexus_per_block:
        return nexuslist
    return n
Example #11
0
class Test_NexusWriter_1(unittest.TestCase):
    """Exercises ``NexusWriter.add`` with one fixture character per test."""

    def setUp(self):
        # every test starts from an empty writer
        self.nex = NexusWriter()

    def test_char_adding1(self):
        """Test Character Addition 1"""
        states = data['char1']
        for tx in states:
            self.nex.add(tx, 'char1', states[tx])
        stored = self.nex.data['char1']
        assert stored['French'] == '1'
        assert stored['English'] == '2'
        assert stored['Latin'] == '3'

    def test_char_adding2(self):
        """Test Character Addition 2"""
        states = data['char2']
        for tx in states:
            self.nex.add(tx, 'char2', states[tx])
        stored = self.nex.data['char2']
        assert stored['French'] == '4'
        assert stored['English'] == '5'
        assert stored['Latin'] == '6'
Example #12
0
class Test_NexusWriter_1(unittest.TestCase):
    """Exercises ``NexusWriter.add`` with one fixture character per test."""

    def setUp(self):
        # every test starts from an empty writer
        self.nex = NexusWriter()

    def test_char_adding1(self):
        """Test Character Addition 1"""
        for tx, value in data['char1'].items():
            self.nex.add(tx, 'char1', value)
        # expected state per taxon, checked in the original order
        for taxon, expected in (('French', '1'), ('English', '2'),
                                ('Latin', '3')):
            assert self.nex.data['char1'][taxon] == expected

    def test_char_adding2(self):
        """Test Character Addition 2"""
        for tx, value in data['char2'].items():
            self.nex.add(tx, 'char2', value)
        # expected state per taxon, checked in the original order
        for taxon, expected in (('French', '4'), ('English', '5'),
                                ('Latin', '6')):
            assert self.nex.data['char2'][taxon] == expected
Example #13
0
def shufflenexus(nexus_obj, resample=False):
    """
    Shuffles the characters between each taxon to create a new nexus

    For each output character an existing character is drawn at random (with
    replacement) and its states are shuffled across the taxa.

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :param resample: The number of characters to resample. If set to False, then
        the number of characters will equal the number of characters in the
        original data file.
    :type resample: Integer

    :return: A NexusWriter instance holding the shuffled data
    :raises AssertionError: if nexus_obj is not a nexus
    :raises ValueError: if resample is not False or a positive Integer
    :raises NexusFormatException: if nexus_obj does not have a `data` block
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    if resample is False:
        resample = nexus_obj.data.nchar

    try:
        resample = int(resample)
    except (TypeError, ValueError):
        # int(None), int([]) etc. raise TypeError; report those through the
        # documented ValueError as well.
        raise ValueError('resample must be a positive integer or False!')

    if resample < 1:
        raise ValueError('resample must be a positive integer or False!')

    newnexus = NexusWriter()
    newnexus.add_comment("Randomised Nexus generated from %s" % nexus_obj.filename)

    taxa = nexus_obj.data.taxa
    for i in range(resample):
        # pick existing character
        character = randrange(0, nexus_obj.data.nchar)
        chars = nexus_obj.data.characters[character]
        site_values = [chars[taxon] for taxon in taxa]
        shuffle(site_values)
        # pair taxa with the shuffled states in order (same assignment as
        # repeatedly popping from the front, without the O(n) pops)
        for taxon, value in zip(taxa, site_values):
            newnexus.add(taxon, i, value)
    return newnexus
Example #14
0
def new_nexus_without_sites(nexus_obj, sites_to_remove):
    """
    Returns a new NexusWriter instance with the sites in
    `sites_to_remove` removed.

    The remaining sites are renumbered consecutively from 0 and a comment
    listing the removed sites is added to the output.

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :param sites_to_remove: A list of site numbers
    :type sites_to_remove: List

    :return: A NexusWriter instance
    :raises AssertionError: if nexus_obj is not a nexus
    :raises NexusFormatException: if nexus_obj does not have a `data` block
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    # make new nexus
    nexout = NexusWriter()
    nexout.add_comment(
        "Removed %d sites: %s" %
        (len(sites_to_remove), ",".join(["%s" % s for s in sites_to_remove])))

    # build the lookup once; membership is tested for every site below
    removed = frozenset(sites_to_remove)

    new_sitepos = 0
    for sitepos in range(nexus_obj.data.nchar):
        if sitepos in removed:
            continue  # skip!
        # iterating the data block yields (taxon, states) pairs
        for taxon, states in nexus_obj.data:
            nexout.add(taxon, new_sitepos, states[sitepos])
        new_sitepos += 1
    return nexout
Example #15
0
def create_nexus(bt_matrix, sample_list):
    '''
    create NexusWriter object from BedTool matrix
    BedTool object apparently can contain empty lines, a simple length-check skips these

    Each usable line is tab-separated as chrom, start, end and a
    comma-separated list of samples. For every such line one character named
    "<chrom>:<start> - <end>" is added, coded 1 for each taxon of
    `sample_list` that occurs in the sample list and 0 otherwise.

    :param bt_matrix: a BedTool object, or an iterable of tab-separated lines
    :param sample_list: taxa to emit for every locus
    :return: the populated NexusWriter

    requires pybedtools, nexus
    '''

    n = NexusWriter()
    if isinstance(bt_matrix, BedTool):
        # create lines from BedTool object
        matrix_lines = str(bt_matrix).split("\n")
    else:
        matrix_lines = bt_matrix

    current_chromosome = ""
    for line in matrix_lines:
        fields = line.split("\t")

        if len(fields) < 4:  # empty or incomplete line: four columns needed
            print("skipping: line empty:  %s" % ",".join(fields))
            continue

        chrom, start, end = fields[0], fields[1], fields[2]
        # set: membership is tested once per taxon below
        samples_present = set(fields[3].split(","))
        locus = "%s:%i - %i" % (chrom, int(start), int(end))  # define locus

        # progress output: print each chromosome when it first changes
        if chrom != current_chromosome:
            print(chrom)
            current_chromosome = chrom

        for taxon in sample_list:  # iterate over taxons
            presence = 1 if taxon in samples_present else 0
            n.add(taxon, locus, presence)

    return n
def make_nexus(model_dict):
    """Build a NexusWriter with one concatenated state vector per language.

    For every row of `model_dict` the values of all keys except 'Language'
    and 'ISOCODE' are visited in sorted key order. Each value is turned into
    its ``repr`` with single quotes removed, characters 2..35 of that text
    are kept and zero-padded on the left to width 35, and the pieces are
    concatenated. The result is added under the row's 'Language' value with
    the fixed character label "756_51_1".

    :param model_dict: iterable of dict-like rows (must support ``.get`` and
        iteration over keys)
    :return: the populated NexusWriter
    """
    n = NexusWriter()
    # label is number of elements in vector, number of taxa and a random 1
    label = "%s_%s_%d" % ("756", "51", 1)
    for dictline in model_dict:
        lang = dictline.get('Language')
        pieces = []
        for key in sorted(dictline):
            if key in ('Language', 'ISOCODE'):
                continue
            text = repr(dictline.get(key)).replace("'", "")
            pieces.append(text[2:36].zfill(35))
        n.add(lang, label, "".join(pieces))
    return n
Example #17
0
def binarise(nexus_obj, one_nexus_per_block=False):
    """
    Returns a binary variant of the given `nexus_obj`.
    If `one_nexus_per_block` then we return a list of NexusWriter instances.

    Every character is recoded with `_recode_to_binary`; position `j` of the
    recoded states becomes a new character labelled ``<label>_<j>``.

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :param one_nexus_per_block: Whether to return a single NexusWriter, or a
                                list of NexusWriter's (one per character)
    :type one_nexus_per_block: Boolean

    :return: A NexusWriter instance or a list of NexusWriter instances.
    :raises AssertionError: if nexus_obj is not a nexus
    :raises NexusFormatException: if nexus_obj does not have a `data` block
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])
    nexuslist = []
    n = NexusWriter()

    for i in sorted(nexus_obj.data.charlabels):
        label = nexus_obj.data.charlabels[i]  # character label
        char = nexus_obj.data.characters[
            label]  # character dict (taxon->state)
        recoding = _recode_to_binary(char)  # recode

        # Width of the recoding, measured on its first entry. The previous
        # `recoding.keys()[0]` only works on Python 2, where keys() is a
        # list; iterating the values works on both major versions. An empty
        # recoding yields width 0, i.e. nothing is added for this character.
        new_char_length = len(next(iter(recoding.values()), ()))

        # loop over recoded data
        for j in range(new_char_length):
            # the label depends on j only, so build it once per position
            new_label = "%s_%d" % (str(label), j)
            for taxon, state in recoding.items():
                n.add(taxon, new_label, state[j])

        if one_nexus_per_block:
            # hand out the finished writer and start a new one
            nexuslist.append(n)
            n = NexusWriter()

    if one_nexus_per_block:
        return nexuslist
    return n
Example #18
0
def shufflenexus(nexus_obj, resample=False):
    """
    Shuffles the characters between each taxon to create a new nexus

    For each output character an existing character is drawn at random (with
    replacement) and its states are shuffled across the taxa.

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :param resample: The number of characters to resample. If set to False, then
        the number of characters will equal the number of characters in the
        original data file.
    :type resample: Integer

    :return: A NexusWriter instance holding the shuffled data
    :raises AssertionError: if nexus_obj is not a nexus
    :raises ValueError: if resample is not False or a positive Integer
    :raises NexusFormatException: if nexus_obj does not have a `data` block
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    if resample is False:
        resample = nexus_obj.data.nchar

    try:
        resample = int(resample)
    except (TypeError, ValueError):
        # int(None), int([]) etc. raise TypeError; report those through the
        # documented ValueError as well.
        raise ValueError('resample must be a positive integer or False!')

    if resample < 1:
        raise ValueError('resample must be a positive integer or False!')

    newnexus = NexusWriter()
    newnexus.add_comment("Randomised Nexus generated from %s" %
                         nexus_obj.filename)

    taxa = nexus_obj.data.taxa
    for i in range(resample):
        # pick existing character
        character = randrange(0, nexus_obj.data.nchar)
        chars = nexus_obj.data.characters[character]
        site_values = [chars[taxon] for taxon in taxa]
        shuffle(site_values)
        # pair taxa with the shuffled states in order (same assignment as
        # repeatedly popping from the front, without the O(n) pops)
        for taxon, value in zip(taxa, site_values):
            newnexus.add(taxon, i, value)
    return newnexus
Example #19
0
def multistatise(nexus_obj):
    """
    Returns a multistate variant of the given `nexus_obj`.

    Each site of the input is assigned one uppercase letter ('A' for the
    first site visited, 'B' for the next, ...). Every taxon whose state at a
    site is '1' gets that site's letter added under a single character label
    (`nexus_obj.short_filename` when present, otherwise 1). Taxa that end up
    without any entry under that label receive '?'.

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :return: A NexusReader instance
    :raises AssertionError: if nexus_obj is not a nexus, if a state is not a
        string, or if there are too many sites for the A-Z alphabet
    :raises NexusFormatException: if nexus_obj does not have a `data` block
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])
        
    site_idx = 0
    nexout = NexusWriter()
    # taxa seen with '?'/'-' and not (yet) seen with '1'.
    # NOTE(review): this list is only appended to and removed from here; it
    # is never read when the final '?' states are added below.
    missing = []
    
    # single output character label; falls back to the integer 1
    charlabel = getattr(nexus_obj, 'short_filename', 1)
    
    for site, data in nexus_obj.data.characters.items():
        # 65 is ord('A'): letter for the current site
        multistate_value = chr(65 + site_idx)
        for taxon, value in data.items():
            # states must already be strings
            assert value == str(value)
            if value in ('?', '-'):
                missing.append(taxon)
                
            if value == '1':
                # NOTE(review): a taxon that has '1' at several sites is
                # added several times under the same label; what that does
                # depends on NexusWriter.add - confirm.
                nexout.add(taxon, charlabel, multistate_value)
                if taxon in missing: # remove taxon if we've seen a non-? entry
                    missing.remove(taxon)
        site_idx += 1
        # NOTE(review): checked after the increment, so a 26th site (which
        # would map to 'Z') already fails this assertion - confirm intended.
        assert site_idx < 26, "Too many characters to handle! - run out of A-Z"
        
    # add missing state for anything that is all missing, and has not been
    # observed anywhere
    for taxon in nexus_obj.data.taxa:
        # NOTE(review): the lookup uses str(charlabel) while add() above
        # received charlabel itself; presumably NexusWriter stores labels as
        # strings - verify, and verify the key exists when nothing was added.
        if taxon not in nexout.data[str(charlabel)]:
            nexout.add(taxon, charlabel, '?')
    return nexout._convert_to_reader()
Example #20
0
def combine_nexuses(nexuslist):
    """
    Concatenates the data blocks of several NexusReader instances.

    Every character of every input is copied into one writer under the label
    ``<nexus label>.<character label>``. A comment recording the character
    range and label of each input is added as well.

    :param nexuslist: A list of NexusReader instances
    :type nexuslist: List

    :return: A NexusWriter instance

    :raises TypeError: if nexuslist is not a list of NexusReader instances
    :raises IOError: if unable to read an file in nexuslist
    :raises NexusFormatException: if a nexus file does not have a `data` block
    """
    combined = NexusWriter()
    charpos = 0
    for position, reader in enumerate(nexuslist, 1):
        check_for_valid_NexusReader(reader, required_blocks=['data'])

        # label preference: file name stem, then explicit label, then the
        # 1-based position in the list
        if hasattr(reader, 'short_filename'):
            nexus_label = os.path.splitext(reader.short_filename)[0]
        elif hasattr(reader, 'label'):
            nexus_label = reader.label
        else:
            nexus_label = str(position)

        last_pos = charpos + reader.data.nchar - 1
        combined.add_comment("%d - %d: %s" % (charpos, last_pos, nexus_label))

        site_idx = 0
        for site in sorted(reader.data.characters):
            states = reader.data.characters.get(site)
            charpos += 1
            # work out character label (defaults to the 1-based site index)
            charlabel = reader.data.charlabels.get(site_idx, site_idx + 1)
            label = '%s.%s' % (nexus_label, charlabel)

            for taxon, value in states.items():
                combined.add(taxon, label, value)
            site_idx += 1
    return combined
Example #21
0
class Test_NexusWriter_2(unittest.TestCase):
    """Checks the text produced by ``NexusWriter.make_nexus``.

    Covers the non-interleaved, character-block and interleaved variants.
    All regular expressions that contain backslash escapes are raw strings so
    that Python does not treat ``\\s``, ``\\w``, ``\\d`` or ``\\=`` as
    (invalid) string escapes; the patterns themselves are unchanged.
    """

    def setUp(self):
        # populate a fresh writer with the module-level `data` fixture
        self.nex = NexusWriter()
        for char, b in data.items():
            for taxon, value in b.items():
                self.nex.add(taxon, char, value)

    def test_nexus_noninterleave(self):
        """Test Nexus Generation - Non-Interleaved"""
        n = self.nex.make_nexus(interleave=False)
        assert re.search("#NEXUS", n)
        assert re.search("BEGIN DATA;", n)
        assert re.search("DIMENSIONS NTAX=3 NCHAR=2;", n)
        assert re.search("MATRIX", n)
        assert re.search(r"Latin\s+36", n)
        assert re.search(r"French\s+14", n)
        assert re.search(r"English\s+25", n)
        assert re.search(r"FORMAT.*MISSING\=(.+?)", n).groups()[0] == '?'
        assert re.search(r"FORMAT.*DATATYPE\=(\w+)\s",
                         n).groups()[0] == 'STANDARD'
        assert re.search(r'FORMAT.*SYMBOLS\="(\d+)";',
                         n).groups()[0] == '123456'

    def test_nexus_charblock(self):
        """Test Nexus Generation - with characters block"""
        n = self.nex.make_nexus(charblock=True)
        assert re.search("#NEXUS", n)
        assert re.search("BEGIN DATA;", n)
        assert re.search("DIMENSIONS NTAX=3 NCHAR=2;", n)
        assert re.search("CHARSTATELABELS", n)
        assert re.search("1 char1,", n)
        assert re.search("2 char2", n)
        assert re.search("MATRIX", n)
        assert re.search(r"Latin\s+36", n)
        assert re.search(r"French\s+14", n)
        assert re.search(r"English\s+25", n)
        assert re.search(r"FORMAT.*MISSING\=(.+?)", n).groups()[0] == '?'
        assert re.search(r"FORMAT.*DATATYPE\=(\w+)\s",
                         n).groups()[0] == 'STANDARD'
        assert re.search(r'FORMAT.*SYMBOLS\="(\d+)";',
                         n).groups()[0] == '123456'

    def test_nexus_interleave(self):
        """Test Nexus Generation - Interleaved"""
        n = self.nex.make_nexus(interleave=True)
        assert re.search("#NEXUS", n)
        assert re.search("BEGIN DATA;", n)
        assert re.search("DIMENSIONS NTAX=3 NCHAR=2;", n)
        assert re.search("MATRIX", n)
        # char1
        assert re.search(r"Latin\s+3", n)
        assert re.search(r"French\s+1", n)
        assert re.search(r"English\s+2", n)
        # char2
        assert re.search(r"Latin\s+6", n)
        assert re.search(r"French\s+4", n)
        assert re.search(r"English\s+5", n)

        assert re.search(r"FORMAT.*MISSING\=(.+?)", n).groups()[0] == '?'
        assert re.search(r"FORMAT.*DATATYPE\=(\w+)\s",
                         n).groups()[0] == 'STANDARD'
        assert re.search("FORMAT.*(INTERLEAVE)", n).groups()[0] == 'INTERLEAVE'
        assert re.search(r'FORMAT.*SYMBOLS\="(\d+)";',
                         n).groups()[0] == '123456'
Example #22
0
 def setUp(self):
     """Give every test its own, initially empty NexusWriter as `self.nex`."""
     self.nex = NexusWriter()
Example #23
0
 def setUp(self):
     """Give every test its own, initially empty NexusWriter as `self.nex`."""
     self.nex = NexusWriter()
Example #24
0
def create_nexus(bt_matrix, sample_list, absence_code=0, has_missing=False):
    '''
    create NexusWriter object from BedTool matrix
    BedTool object apparently can contain empty lines, a simple length-check skips these

    The matrix is converted with str() and split into lines; each line is
    split on tabs into chrom, start, end and a comma-separated list of
    present samples (plus, when has_missing is set, a fifth column and a
    comma-separated list of samples with missing calls). One character named
    "<chrom>_<start>_<end>" is added per line.

    :param bt_matrix: object whose str() yields the tab-separated matrix text
    :param sample_list: taxa to emit for every locus
    :param absence_code: state written for taxa that are not present (and not missing)
    :param has_missing indicate whether input BED file should be parsed for uncertainly called loci, that will be coded as "?"
    :return: the populated NexusWriter
    requires pybedtools, nexus

    NOTE: uses Python 2 print statements and the name `modulename`, which is
    not defined in this function (presumably a module-level global).
    '''
    from nexus import NexusWriter
    n = NexusWriter()
    # if isinstance(bt_matrix, BedTool):
    #     matrix_lines = str(bt_matrix).split("\n")  # create lines from BedTool object
    # else:
    #     matrix_lines = str(bt_matrix).split("\n")
    matrix_lines = str(bt_matrix).split("\n")
    # last chromosome reported, so each one is announced once per run of lines
    current_chrom = ""
    if has_missing:
        print u"[ {} ] adding missing characters".format(modulename)

    for line in matrix_lines:
        line = line.split("\t")

        if len(line) < 4:  # check for incomplete line
            # trailing comma: Python 2 keeps both prints on one output line
            print u"[ {} ]".format(modulename),
            print u"skipping incomplete or empty line:  %s" % ",".join(line)
            continue

        if not has_missing:

            chrom, start, end, samples_present = line[0], line[1], line[
                2], line[3].split(
                    ",")  # create locus and present_sample objects
            locus = "%s_%i_%i" % (chrom, int(start), int(end))  # define locus

            if current_chrom != chrom:
                print u"[ {} ] Operating on".format(modulename),
                print chrom
                current_chrom = chrom
            # add locus to nexus object
            for taxon in sample_list:  # iterate over taxons
                char = 1 if taxon in samples_present else absence_code
                n.add(taxon, locus, char)

        elif has_missing:
            # NOTE(review): six columns are read here, but the length check
            # above only guarantees four - lines with 4 or 5 columns would
            # raise IndexError. Confirm the input always has six columns.
            # `type` (column 5) is unpacked but not used afterwards and
            # shadows the builtin inside this function.
            chrom, start, end, samples_present, type, samples_missing = line[
                0], line[1], line[2], line[3].split(
                    ","), line[4], line[5].split(
                        ",")  # create locus and present_sample objects
            locus = "%s_%i_%i" % (chrom, int(start), int(end))  # define locus

            if current_chrom != chrom:
                print u"[ {} ] Operating on".format(modulename),
                print chrom
                current_chrom = chrom

            # add locus to nexus object
            for taxon in sample_list:  # iterate over taxons
                # presence wins over missing; everything else is absent
                if taxon in samples_present:
                    char = 1
                elif taxon in samples_missing:
                    char = "?"
                else:
                    char = absence_code
                n.add(taxon, locus, char)

    return n
 def setUp(self):
     """Fill a fresh NexusWriter with every state of the `data` fixture."""
     self.nex = NexusWriter()
     for char, states in data.items():
         for taxon in states:
             self.nex.add(taxon, char, states[taxon])
Example #26
0
 def setUp(self):
     """Fill a fresh NexusWriter with every state of the `data` fixture."""
     self.nex = NexusWriter()
     for char in data:
         states = data[char]
         for taxon in states:
             self.nex.add(taxon, char, states[taxon])
class Test_NexusWriter(unittest.TestCase):
    """Tests for NexusWriter: adding/removing data and the output formats.

    Every test starts from a writer pre-loaded with the module-level ``data``
    fixture.  The assertions below expect that fixture to hold two characters
    (``char1``: French=1, English=2, Latin=3; ``char2``: French=4, English=5,
    Latin=6).
    """

    def setUp(self):
        """Build a fresh writer containing every entry of ``data``."""
        self.nex = NexusWriter()
        for char in data:
            for taxon, value in data[char].items():
                self.nex.add(taxon, char, value)

    def test_char_adding1(self):
        """Test Character Addition 1"""
        assert self.nex.data['char1']['French'] == '1'
        assert self.nex.data['char1']['English'] == '2'
        assert self.nex.data['char1']['Latin'] == '3'

    def test_char_adding2(self):
        """Test Character Addition 2"""
        assert self.nex.data['char2']['French'] == '4'
        assert self.nex.data['char2']['English'] == '5'
        assert self.nex.data['char2']['Latin'] == '6'

    def test_char_adding_integer(self):
        """Test Character Addition as integer"""
        self.nex.add('French', 'char3', 9)
        self.nex.add('English', 'char3', '9')
        # An integer value and a string value must both be stored as '9'.
        assert self.nex.data['char3']['French'] == '9'
        assert self.nex.data['char3']['English'] == '9'

    def test_characters(self):
        """Both fixture characters are listed in ``characters``."""
        assert 'char1' in self.nex.characters
        assert 'char2' in self.nex.characters

    def test_taxa(self):
        """All three fixture taxa are listed in ``taxa``."""
        assert 'French' in self.nex.taxa
        assert 'English' in self.nex.taxa
        assert 'Latin' in self.nex.taxa

    def test_remove(self):
        """Removing one cell keeps the taxon itself."""
        self.nex.remove("French", "char2")
        assert 'French' not in self.nex.data['char2']
        assert 'French' in self.nex.taxa

    def test_remove_character(self):
        """Removing a character drops it from ``characters`` and ``data``."""
        self.nex.remove_character("char2")
        assert len(self.nex.characters) == 1
        assert 'char2' not in self.nex.data

    def test_remove_taxon(self):
        """Removing a taxon drops it from every character and from the output."""
        self.nex.remove_taxon("French")
        assert 'French' not in self.nex.taxa
        for char in self.nex.data:
            assert 'French' not in self.nex.data[char]
        n = self.nex.make_nexus(interleave=False)
        assert re.search(r"DIMENSIONS NTAX=2 NCHAR=2;", n)
        assert 'French' not in n

    def test_nexus_noninterleave(self):
        """Test Nexus Generation - Non-Interleaved"""
        n = self.nex.make_nexus(interleave=False)
        assert re.search(r"#NEXUS", n)
        assert re.search(r"BEGIN DATA;", n)
        assert re.search(r"DIMENSIONS NTAX=3 NCHAR=2;", n)
        assert re.search(r"MATRIX", n)
        assert re.search(r"Latin\s+36", n)
        assert re.search(r"French\s+14", n)
        assert re.search(r"English\s+25", n)
        assert re.search(r"FORMAT.*MISSING\=(.+?)", n).groups()[0] == '?'
        assert re.search(r"FORMAT.*DATATYPE\=(\w+)\s", n).groups()[0] \
            == 'STANDARD'
        assert re.search(r'FORMAT.*SYMBOLS\="(\d+)";', n).groups()[0] \
            == '123456'

    def test_nexus_charblock(self):
        """Test Nexus Generation - with characters block"""
        n = self.nex.make_nexus(charblock=True)
        assert re.search(r"#NEXUS", n)
        assert re.search(r"BEGIN DATA;", n)
        assert re.search(r"DIMENSIONS NTAX=3 NCHAR=2;", n)
        assert re.search(r"CHARSTATELABELS", n)
        assert re.search(r"1 char1,", n)
        assert re.search(r"2 char2", n)
        assert re.search(r"MATRIX", n)
        assert re.search(r"Latin\s+36", n)
        assert re.search(r"French\s+14", n)
        assert re.search(r"English\s+25", n)
        assert re.search(r"FORMAT.*MISSING\=(.+?)", n).groups()[0] == '?'
        assert re.search(r"FORMAT.*DATATYPE\=(\w+)\s", n).groups()[0] \
            == 'STANDARD'
        assert re.search(r'FORMAT.*SYMBOLS\="(\d+)";', n).groups()[0] \
            == '123456'

    def test_nexus_interleave(self):
        """Test Nexus Generation - Interleaved"""
        n = self.nex.make_nexus(interleave=True)
        assert re.search(r"#NEXUS", n)
        assert re.search(r"BEGIN DATA;", n)
        assert re.search(r"DIMENSIONS NTAX=3 NCHAR=2;", n)
        assert re.search(r"MATRIX", n)
        # char1
        assert re.search(r"Latin\s+3", n)
        assert re.search(r"French\s+1", n)
        assert re.search(r"English\s+2", n)
        # char2
        assert re.search(r"Latin\s+6", n)
        assert re.search(r"French\s+4", n)
        assert re.search(r"English\s+5", n)

        assert re.search(r"FORMAT.*MISSING\=(.+?)", n).groups()[0] == '?'
        assert re.search(r"FORMAT.*DATATYPE\=(\w+)\s", n).groups()[0] == \
            'STANDARD'
        assert re.search(r"FORMAT.*(INTERLEAVE)", n).groups()[0] == \
            'INTERLEAVE'
        assert re.search(r'FORMAT.*SYMBOLS\="(\d+)";', n).groups()[0] == \
            '123456'

    def test_polymorphic_characters(self):
        """A second state for the same cell is stored as '12' and written as (12)."""
        self.nex.add("French", "char1", 2)
        self.assertEqual(self.nex.data['char1']['French'], "12")
        n = self.nex.make_nexus(charblock=True)
        assert re.search(r"DIMENSIONS NTAX=3 NCHAR=2;", n)  # no change
        assert re.search(r"French\s+\(12\)4", n)

    def test_write_to_file(self):
        """write_to_file creates a file holding the non-interleaved nexus."""
        tmp = NamedTemporaryFile(delete=False, suffix=".nex")
        tmp.close()
        try:
            self.nex.write_to_file(tmp.name)
            assert os.path.isfile(tmp.name)

            with open(tmp.name, 'r') as handle:
                n = handle.read()
        finally:
            # Clean up even when the write or the existence check fails, so
            # a failing run does not leave temp files behind.
            if os.path.isfile(tmp.name):
                os.unlink(tmp.name)

        assert re.search(r"#NEXUS", n)
        assert re.search(r"BEGIN DATA;", n)
        assert re.search(r"DIMENSIONS NTAX=3 NCHAR=2;", n)
        assert re.search(r"MATRIX", n)
        assert re.search(r"Latin\s+36", n)
        assert re.search(r"French\s+14", n)
        assert re.search(r"English\s+25", n)
        assert re.search(r"FORMAT.*MISSING\=(.+?)", n).groups()[0] == '?'
        assert re.search(r"FORMAT.*DATATYPE\=(\w+)\s", n).groups()[0] \
            == 'STANDARD'
        assert re.search(r'FORMAT.*SYMBOLS\="(\d+)";', n).groups()[0] \
            == '123456'

    def test_write_as_table(self):
        """write_as_table emits one line per taxon."""
        content = self.nex.write_as_table()
        assert re.search(r"Latin\s+36", content)
        assert re.search(r"French\s+14", content)
        assert re.search(r"English\s+25", content)
        assert len(content.split("\n")) == 3

    def test_write_as_table_with_polymorphoc(self):
        """write_as_table renders a polymorphic cell as (12)."""
        self.nex.add('French', 'char1', '2')
        content = self.nex.write_as_table()
        assert re.search(r"Latin\s+36", content)
        assert re.search(r"French\s+\(12\)4", content)
        assert re.search(r"English\s+25", content)
        assert len(content.split("\n")) == 3
Example #28
0
  def read_file(self):
    """Convert an uploaded .txt matrix into a Nexus file.

    The first line of ``self.input_file`` selects the handling:

    * ``#NEXUS``: the file is renamed to ``.nex`` and handed to
      ``NexusHandler``; that handler's ``read_file()`` result is returned.
    * ``xread``: the matrix rows are parsed, each taxon name is passed to
      ``verifyTaxa``, the matrix is written through ``NexusWriter`` and a
      ``VERIFIED_TAXA`` block is appended to the written file.  The path of
      the new ``.nex`` file is returned.
    * anything else: a message is printed and ``None`` is returned.

    Side effects: sets ``self.first_line``; in the xread case also sets
    ``self.ncols``, ``self.nrows`` and ``self.custom_block``.

    Raises ``IndexError`` if fewer matrix rows were parsed than the header
    announces, or if a row has no character data after the taxon name.
    """
    with open(self.input_file, 'r') as f:
      self.first_line = f.readline()

      # TODO: Needs work xread / tnt file format is loose

      # The second line is consumed but not used; the third holds the
      # matrix dimensions.
      f.readline()
      xread_matrix_dimensions = f.readline()
      lines = f.readlines()

    header = self.first_line.strip()

    if "#NEXUS" == header:
      print("text file is nexus")

      # move to nexus folder
      filename = os.path.splitext(self.input_file)[0]
      os.rename(self.input_file, filename + ".nex")

      # send to correct matrix handler
      # TODO: A little dirty, probably should
      # change mediator's handler param so this flows cleaner
      matrixHandler = NexusHandler(filename + ".nex")
      return matrixHandler.read_file()

    elif "xread" == header:

      print("xread format found")

      dimensions = str(xread_matrix_dimensions).split(' ')
      self.ncols = int(dimensions[0])
      self.nrows = int(dimensions[1])

      custom_block = "\n\nBEGIN VERIFIED_TAXA;\n"
      custom_block += "Dimensions ntax=" + str(self.nrows) + " nchar=4;\n"

      # row_taxa[i] is the label for the character string matrix_arr[i];
      # the two lists must stay the same length.
      matrix_arr = []
      row_taxa = []
      line_buffer = ""
      for l in lines:
        if ";proc/;" != l.strip():

          if line_buffer:
            line_row = line_buffer + " " + l.strip()
          else:
            line_row = l.strip()

          if len(line_row) >= self.ncols:

            # reconstitute broken rows, then remove space/tabbing
            line_parts = line_row.split(' ')
            line_parts = list(filter(None, line_parts))

            taxon_name = line_parts[0]
            taxon_chars = line_parts[1]

            #  verify taxa
            verified_taxa = verifyTaxa(taxon_name)
            verified_name = None

            if verified_taxa:
              for taxa in verified_taxa:

                # We split here to exclude the odd citation on the taxon name ( maybe regex what looks like name & name, year would be better )
                verified_name = taxa['name_string'].lower().split(' ')

                custom_block += taxon_name + "    " + taxa['name_string'] + "    " + taxa['match_value'] + "    " + taxa['datasource'] + "\n"

              # Exactly one label per matrix row (the last match), so a
              # taxon with several matches does not shift later rows.
              row_taxa.append(verified_name[0])
            else:
              row_taxa.append(taxon_name)
              custom_block += taxon_name + "\n"

            matrix_arr.append(taxon_chars.strip())
            line_buffer = ""
          else:
            # Row is shorter than the declared character count: keep it and
            # join it with the following line.
            line_buffer += l.strip()

      custom_block += ";\n"
      custom_block += "END;\n"

      self.custom_block = custom_block

      print("matrix array")

      nw = NexusWriter()
      nw.add_comment("Morphobank generated Nexus from xread .txt file ")

      for rx in range(self.nrows):
        taxon_name = row_taxa[rx]
        # Walk the row's characters directly.  Going through numpy.matrix
        # and .item(rx) yields a single flat-indexed cell when all rows have
        # equal length, which wrote only one character per taxon.
        # NOTE(review): bracketed polymorphic cells such as [01] are still
        # split into separate characters here -- confirm intended handling.
        for char_no, cv in enumerate(matrix_arr[rx], 1):
          nw.add(taxon_name, char_no, cv)

      # keep the upload path and filename, but change extension; splitext
      # only strips the final extension, so dots elsewhere in the path
      # (e.g. "./uploads/x.txt") are preserved.
      output_file = os.path.splitext(self.input_file)[0] + '.nex'

      nw.write_to_file(filename=output_file, interleave=False, charblock=True)

      # Custom Block Section: appended after the writer's output.
      with codecs.open(output_file, 'a', 'utf-8') as nexus_file:
        nexus_file.write(custom_block)

      return output_file

    else:
      print("do not know how to process this .txt file")
Example #29
0
  def read_file(self):
    """Convert the first sheet of an Excel workbook into a Nexus file.

    Row 0 and column 0 are not treated as data: column 0 of every other row
    supplies the taxon name and the remaining cells are its character
    values, added under the column index as character label.  Each taxon
    name is passed to ``verifyTaxa``; when matches are returned, the
    lower-cased ``name_string`` of the last match is used as the taxon label
    and every match is listed in an appended ``VERIFIED_TAXA`` block.

    Side effects: sets ``self.nrows``, ``self.ncols`` and
    ``self.custom_block``, and writes ``<input path without extension>.nex``.

    Returns the path of the written ``.nex`` file.
    """
    # Local import so this method does not rely on a module-level import.
    import os

    book = xlrd.open_workbook(self.input_file)
    sh = book.sheet_by_index(0)

    nw = NexusWriter()

    self.nrows = sh.nrows
    self.ncols = sh.ncols

    nw.add_comment("Morphobank generated Nexus File")

    # NOTE(review): ntax is the raw sheet row count, which includes row 0
    # even though that row is skipped below -- confirm this is intended.
    custom_block = "\n\nBEGIN VERIFIED_TAXA;\n"
    custom_block += "Dimensions ntax=" + str(self.nrows) + " nchar=4;\n"

    # Species List Taxa (row 0 is skipped)
    for rx in range(1, self.nrows):

      taxon_name = str(sh.cell_value(rowx=rx, colx=0)).strip()

      verified_taxa = verifyTaxa(taxon_name)
      verified_name = None

      if verified_taxa:
        for taxa in verified_taxa:
          verified_name = taxa['name_string'].lower()
          custom_block += taxon_name + "    " + taxa['name_string'] + "    " + taxa['match_value'] + "    " + taxa['datasource'] + "\n"
      else:
        custom_block += taxon_name + "\n"

      # Prefer the verified name; fall back to the name from the sheet.
      row_label = verified_name if verified_name else taxon_name

      # Column 0 holds the taxon name, so character data starts at column 1.
      for cx in range(1, self.ncols):
        raw_value = sh.cell_value(rowx=rx, colx=cx)

        if is_number(raw_value):
          cell_value = int(raw_value)
        else:
          # Curly-brace state sets are rewritten with parentheses.
          cell_value = raw_value.replace("{", "(")
          cell_value = cell_value.replace("}", ")")

        nw.add(row_label, cx, cell_value)

    custom_block += ";\n"
    custom_block += "END;\n"

    self.custom_block = custom_block

    # keep the upload path and filename, but change extension; splitext only
    # strips the final extension, so dots elsewhere in the path
    # (e.g. "./uploads/x.xls") are preserved.
    output_file = os.path.splitext(self.input_file)[0] + '.nex'
    nw.write_to_file(filename=output_file, interleave=False, charblock=True)

    ## quick append Custom Block
    with open(output_file, 'a') as nexus_file:
      nexus_file.write(custom_block)

    return output_file