def snpMatrixGenerator(sourceFile, destFile, recordAll=False, recordRandomSample=True):
    """Collect SNP columns from a multi-block Nexus stream into one Nexus file.

    ``sourceFile`` is read line by line and cut into chunks at every line
    that contains both "begin" and "data" (case-insensitive).  Each chunk is
    parsed with ``NexusReader``; chunks exposing a ``data`` block are handed
    to ``_findDifferences`` together with the shared ``NexusWriter`` and the
    running column counter.  The combined result is written to ``destFile``.

    :param sourceFile: open, iterable text handle; closed before returning
    :param destFile: open, writable text handle; closed before returning
    :param recordAll: forwarded unchanged to ``_findDifferences``
    :param recordRandomSample: forwarded unchanged to ``_findDifferences``
    :raises SystemExit: if ``recordAll`` and ``recordRandomSample`` are equal
    """
    # ``sys.exit`` is always available; the bare ``exit`` helper is injected
    # by the ``site`` module and is missing under ``python -S``.
    import sys

    if recordAll == recordRandomSample:
        print("Invalid Options")
        sys.exit()

    destNexus = NexusWriter()

    def _consume(chunk, column):
        # Parse one accumulated chunk and return the updated column counter.
        reader = NexusReader()
        reader.read_string(chunk)
        if "data" in reader.blocks:
            return _findDifferences(reader, destNexus, column,
                                    recordAll, recordRandomSample)
        return column

    try:
        block = ""
        snpCol = 0
        for line in sourceFile:
            lowered = line.lower()
            if "begin" in lowered and "data" in lowered:
                # a new data block starts: flush what was gathered so far
                snpCol = _consume(block, snpCol)
                block = line
            else:
                block += line
        # flush the trailing chunk
        snpCol = _consume(block, snpCol)
        destFile.write(destNexus.make_nexus() + '\n')
    finally:
        # release both handles even when parsing or writing fails
        destFile.close()
        sourceFile.close()
def test_regression_format_string_has_quoted_symbols(self):
    """Regression: Symbols in the format string should be quoted"""
    writer = NexusWriter()
    for character in data:
        for taxon, state in data[character].items():
            writer.add(taxon, character, state)
    assert 'SYMBOLS="123456"' in writer.make_nexus()
class Test_NexusWriter_2(unittest.TestCase):
    """Check the Nexus text generated by ``NexusWriter.make_nexus``.

    All patterns are raw strings: the regex escapes they contain are not
    valid Python string escapes and trigger warnings in ordinary literals.
    """

    def setUp(self):
        # populate a fresh writer from the module-level ``data`` fixture
        self.nex = NexusWriter()
        for char, b in data.items():
            for taxon, value in b.items():
                self.nex.add(taxon, char, value)

    def test_nexus_noninterleave(self):
        """Test Nexus Generation - Non-Interleaved"""
        n = self.nex.make_nexus(interleave=False)
        assert re.search(r"#NEXUS", n)
        assert re.search(r"BEGIN DATA;", n)
        assert re.search(r"DIMENSIONS NTAX=3 NCHAR=2;", n)
        assert re.search(r"MATRIX", n)
        assert re.search(r"Latin\s+36", n)
        assert re.search(r"French\s+14", n)
        assert re.search(r"English\s+25", n)
        assert re.search(r"FORMAT.*MISSING\=(.+?)", n).groups()[0] == '?'
        assert re.search(r"FORMAT.*DATATYPE\=(\w+)\s", n).groups()[0] == 'STANDARD'
        assert re.search(r'FORMAT.*SYMBOLS\="(\d+)";', n).groups()[0] == '123456'

    def test_nexus_charblock(self):
        """Test Nexus Generation - with characters block"""
        n = self.nex.make_nexus(charblock=True)
        assert re.search(r"#NEXUS", n)
        assert re.search(r"BEGIN DATA;", n)
        assert re.search(r"DIMENSIONS NTAX=3 NCHAR=2;", n)
        assert re.search(r"CHARSTATELABELS", n)
        assert re.search(r"1 char1,", n)
        assert re.search(r"2 char2", n)
        assert re.search(r"MATRIX", n)
        assert re.search(r"Latin\s+36", n)
        assert re.search(r"French\s+14", n)
        assert re.search(r"English\s+25", n)
        assert re.search(r"FORMAT.*MISSING\=(.+?)", n).groups()[0] == '?'
        assert re.search(r"FORMAT.*DATATYPE\=(\w+)\s", n).groups()[0] == 'STANDARD'
        assert re.search(r'FORMAT.*SYMBOLS\="(\d+)";', n).groups()[0] == '123456'

    def test_nexus_interleave(self):
        """Test Nexus Generation - Interleaved"""
        n = self.nex.make_nexus(interleave=True)
        assert re.search(r"#NEXUS", n)
        assert re.search(r"BEGIN DATA;", n)
        assert re.search(r"DIMENSIONS NTAX=3 NCHAR=2;", n)
        assert re.search(r"MATRIX", n)
        # char1
        assert re.search(r"Latin\s+3", n)
        assert re.search(r"French\s+1", n)
        assert re.search(r"English\s+2", n)
        # char2
        assert re.search(r"Latin\s+6", n)
        assert re.search(r"French\s+4", n)
        assert re.search(r"English\s+5", n)
        assert re.search(r"FORMAT.*MISSING\=(.+?)", n).groups()[0] == '?'
        assert re.search(r"FORMAT.*DATATYPE\=(\w+)\s", n).groups()[0] == 'STANDARD'
        assert re.search(r"FORMAT.*(INTERLEAVE)", n).groups()[0] == 'INTERLEAVE'
        assert re.search(r'FORMAT.*SYMBOLS\="(\d+)";', n).groups()[0] == '123456'
def test_regression_format_string_has_datatype_first(self):
    """Regression: Format string should contain 'datatype' as the first element"""
    # SplitsTree complains otherwise.
    writer = NexusWriter()
    for character in data:
        for taxon, state in data[character].items():
            writer.add(taxon, character, state)
    rendered = writer.make_nexus()
    assert "FORMAT DATATYPE=STANDARD" in rendered
def combine_nexuses(*nexuslist):
    """
    Merge several nexus sources into one writer.

    The sources may be given as separate arguments or as one list/tuple;
    each is passed through ``get_nexus_reader`` before combining.

    :param nexuslist: A list of NexusReader instances
    :return: A NexusWriter instance
    :raises ValueError: for a block type other than ``data`` or ``trees``
    """
    # unwrap the "single sequence argument" calling convention
    if len(nexuslist) == 1 and isinstance(nexuslist[0], (list, tuple)):
        nexuslist = nexuslist[0]
    readers = [get_nexus_reader(nex) for nex in nexuslist]

    # every block type that occurs in at least one reader
    block_types = {name for reader in readers for name in reader.blocks}

    combined = NexusWriter()
    handlers = {'data': combine_datablocks, 'trees': combine_treeblocks}
    for block_type in block_types:
        handler = handlers.get(block_type)
        if handler is None:  # pragma: no cover
            raise ValueError("Don't know how to combine %s blocks" % block_type)
        combined = handler(combined, readers)
    return combined
def nexus(args):  # pragma: no cover
    # Command entry point: resolves the dataset, parses ``--output`` from the
    # remaining arguments and emits a NexusWriter either to stdout or to the
    # requested file.
    usage = """ Convert a lexibank dataset to nexus. lexibank nexus <DATASET> --output=... """
    get_dataset(args)

    cli = argparse.ArgumentParser(prog='nexus', usage=usage)
    cli.add_argument('--output', help='Nexus output file', default=None)
    options = cli.parse_args(args.args[1:])

    writer = NexusWriter()
    if options.output:
        writer.write_to_file(filename=options.output)
        return
    # no output file requested: dump to stdout
    print(writer.write())
def binarise(nexus_obj, one_nexus_per_block=False):
    """
    Returns a binary variant of the given `nexus_obj`.
    If `one_nexus_per_block` then we return a list of NexusWriter instances.

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :param one_nexus_per_block: Whether to return a single NexusWriter, or a
        list of NexusWriter's (one per character)
    :type one_nexus_per_block: Boolean

    :return: A NexusWriter instance or a list of NexusWriter instances.
    :raises AssertionError: if nexus_obj is not a nexus
    :raises NexusFormatException: if nexus_obj does not have a `data` block
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])
    nexuslist = []
    n = NexusWriter()

    for i in sorted(nexus_obj.data.charlabels):
        label = nexus_obj.data.charlabels[i]  # character label
        char = nexus_obj.data.characters[label]  # character dict (taxon->state)
        recoding = _recode_to_binary(char)  # recode

        # Width of the recoded character, taken from any one taxon.  Dict
        # views cannot be indexed on Python 3, so peek through an iterator;
        # an empty recoding simply contributes no characters.
        new_char_length = len(next(iter(recoding.values()), ()))

        # loop over recoded data
        for j in range(new_char_length):
            new_label = "%s_%d" % (str(label), j)  # make new label
            for taxon, state in recoding.items():
                n.add(taxon, new_label, state[j])  # add to nexus

        if one_nexus_per_block:
            # hand off this character's writer and start a fresh one
            nexuslist.append(n)
            n = NexusWriter()

    return nexuslist if one_nexus_per_block else n
class Test_NexusWriter_1(unittest.TestCase):
    """Add one character at a time to an empty writer and read it back."""

    def setUp(self):
        self.nex = NexusWriter()

    def _load(self, char):
        # copy a single character from the ``data`` fixture into the writer
        for tx, value in data[char].items():
            self.nex.add(tx, char, value)

    def test_char_adding1(self):
        """Test Character Addition 1"""
        self._load('char1')
        stored = self.nex.data['char1']
        for taxon, state in (('French', '1'), ('English', '2'), ('Latin', '3')):
            assert stored[taxon] == state

    def test_char_adding2(self):
        """Test Character Addition 2"""
        self._load('char2')
        stored = self.nex.data['char2']
        for taxon, state in (('French', '4'), ('English', '5'), ('Latin', '6')):
            assert stored[taxon] == state
def shufflenexus(nexus_obj, resample=False):
    """
    Shuffles the characters between each taxon to create a new nexus

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :param resample: The number of characters to resample. If set to False,
        then the number of characters will equal the number of characters in
        the original data file.
    :type resample: Integer

    :return: A shuffled NexusWriter instance
    :raises AssertionError: if nexus_obj is not a nexus
    :raises ValueError: if resample is not False or a positive Integer
    :raises NexusFormatException: if nexus_obj does not have a `data` block
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    if resample is False:
        resample = nexus_obj.data.nchar

    try:
        resample = int(resample)
    except (TypeError, ValueError):
        # int(None) and similar raise TypeError; report it as documented
        raise ValueError('resample must be a positive integer or False!')

    if resample < 1:
        raise ValueError('resample must be a positive integer or False!')

    newnexus = NexusWriter()
    newnexus.add_comment(
        "Randomised Nexus generated from %s" % nexus_obj.filename)

    taxa = nexus_obj.data.taxa
    for i in range(resample):
        # pick existing character
        character = randrange(0, nexus_obj.data.nchar)
        chars = nexus_obj.data.characters[character]
        site_values = [chars[taxon] for taxon in taxa]
        shuffle(site_values)
        # deal the shuffled states back out in taxon order
        for taxon, value in zip(taxa, site_values):
            newnexus.add(taxon, i, value)
    return newnexus
def new_nexus_without_sites(nexus_obj, sites_to_remove):
    """
    Returns a new NexusWriter instance with the sites in
    `sites_to_remove` removed.

    The remaining sites are renumbered consecutively from 0, and a comment
    listing the removed sites is added to the result.

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :param sites_to_remove: A list of site numbers
    :type sites_to_remove: List

    :return: A NexusWriter instance
    :raises AssertionError: if nexus_obj is not a nexus
    :raises NexusFormatException: if nexus_obj does not have a `data` block
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    # make new nexus
    nexout = NexusWriter()
    nexout.add_comment(
        "Removed %d sites: %s" %
        (len(sites_to_remove), ",".join(["%s" % s for s in sites_to_remove])))

    # build the lookup once so the per-site test is not a list scan
    skipped = set(sites_to_remove)

    new_sitepos = 0
    for sitepos in range(nexus_obj.data.nchar):
        if sitepos in skipped:
            continue  # skip!
        for taxon, states in nexus_obj.data:
            nexout.add(taxon, new_sitepos, states[sitepos])
        new_sitepos += 1
    return nexout
def create_nexus(bt_matrix, sample_list):
    '''
    create NexusWriter object from BedTool matrix

    Each usable line has at least four tab-separated fields: chromosome,
    start, end and a comma-separated list of the samples present at that
    locus.  Every sample in `sample_list` is coded 1 when listed and 0
    otherwise.  The chromosome name is printed whenever it changes.

    BedTool object apparently can contain empty lines, a simple length-check
    skips these (and reports them on stdout)

    :param bt_matrix: a BedTool, or an iterable of line strings
    :param sample_list: iterable of sample (taxon) names
    :return: the populated NexusWriter

    requires pybedtools, nexus
    '''
    n = NexusWriter()
    if isinstance(bt_matrix, BedTool):
        # create lines from BedTool object
        matrix_lines = str(bt_matrix).split("\n")
    else:
        matrix_lines = bt_matrix

    current_chromosome = ""
    for line in matrix_lines:
        fields = line.split("\t")
        if len(fields) < 4:  # empty or truncated line
            print("skipping: line empty: %s" % ",".join(fields))
            continue
        chrom, start, end = fields[:3]
        # set gives constant-time membership tests in the taxon loop
        samples_present = set(fields[3].split(","))
        locus = "%s:%i - %i" % (chrom, int(start), int(end))  # define locus
        if chrom != current_chromosome:
            print(chrom)
            current_chromosome = chrom
        for taxon in sample_list:  # iterate over taxons
            n.add(taxon, locus, 1 if taxon in samples_present else 0)
    return n
def make_nexus(model_dict):
    """Build a NexusWriter holding one concatenated vector per language.

    For every mapping in `model_dict` the values of all keys other than
    'Language' and 'ISOCODE' are visited in sorted key order.  Each value is
    turned into its ``repr``, stripped of single quotes, cut to characters
    2..35 and left-padded with zeros to 35 characters.  The pieces are joined
    and stored under a single fixed label for that mapping's 'Language'.

    :param model_dict: iterable of dict-like rows
    :return: the populated NexusWriter
    """
    n = NexusWriter()
    # label is number of elements in vector, number of taxa and a random 1
    label = "%s_%s_%d" % ("756", "51", 1)
    for dictline in model_dict:
        lang = dictline.get('Language')
        pieces = []
        for key in sorted(dictline):
            if key in ('Language', 'ISOCODE'):
                continue
            text = repr(dictline.get(key)).replace("'", "")
            pieces.append(text[2:36].zfill(35))
        n.add(lang, label, "".join(pieces))
    return n
def binarise(nexus_obj, one_nexus_per_block=False):
    """
    Returns a binary variant of the given `nexus_obj`.
    If `one_nexus_per_block` then we return a list of NexusWriter instances.

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :param one_nexus_per_block: Whether to return a single NexusWriter, or a
        list of NexusWriter's (one per character)
    :type one_nexus_per_block: Boolean

    :return: A NexusWriter instance or a list of NexusWriter instances.
    :raises AssertionError: if nexus_obj is not a nexus
    :raises NexusFormatException: if nexus_obj does not have a `data` block
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])
    nexuslist = []
    n = NexusWriter()

    for i in sorted(nexus_obj.data.charlabels):
        label = nexus_obj.data.charlabels[i]  # character label
        char = nexus_obj.data.characters[label]  # character dict (taxon->state)
        recoding = _recode_to_binary(char)  # recode

        # Width of the recoded character, taken from any one taxon.  Dict
        # views cannot be indexed on Python 3, so peek through an iterator;
        # an empty recoding simply contributes no characters.
        new_char_length = len(next(iter(recoding.values()), ()))

        # loop over recoded data
        for j in range(new_char_length):
            new_label = "%s_%d" % (str(label), j)  # make new label
            for taxon, state in recoding.items():
                n.add(taxon, new_label, state[j])  # add to nexus

        if one_nexus_per_block:
            # hand off this character's writer and start a fresh one
            nexuslist.append(n)
            n = NexusWriter()

    return nexuslist if one_nexus_per_block else n
def multistatise(nexus_obj):
    """
    Collapse the binary sites of `nexus_obj` into one multistate character.

    Sites are lettered A, B, C, ... in iteration order; a taxon whose state
    at a site is '1' receives that site's letter.  Taxa that end up without
    any state are coded '?'.

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :return: A NexusReader instance
    :raises AssertionError: if nexus_obj is not a nexus
    :raises NexusFormatException: if nexus_obj does not have a `data` block
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    charlabel = getattr(nexus_obj, 'short_filename', 1)
    nexout = NexusWriter()
    unresolved = []

    sites = nexus_obj.data.characters.items()
    for position, (_site, states) in enumerate(sites):
        letter = chr(65 + position)
        for taxon, value in states.items():
            assert value == str(value)
            if value in ('?', '-'):
                unresolved.append(taxon)
            if value == '1':
                nexout.add(taxon, charlabel, letter)
                # remove taxon if we've seen a non-? entry
                if taxon in unresolved:
                    unresolved.remove(taxon)
        assert position + 1 < 26, "Too many characters to handle! - run out of A-Z"

    # add missing state for anything that is all missing, and has not been
    # observed anywhere
    for taxon in nexus_obj.data.taxa:
        if taxon not in nexout.data[str(charlabel)]:
            nexout.add(taxon, charlabel, '?')
    return nexout._convert_to_reader()
def combine_nexuses(nexuslist):
    """
    Concatenate the data blocks of several nexuses into a single writer.

    Every character is relabelled ``<nexus label>.<character label>`` and a
    comment recording the position range of each source is added.

    :param nexuslist: A list of NexusReader instances
    :type nexuslist: List

    :return: A NexusWriter instance

    :raises TypeError: if nexuslist is not a list of NexusReader instances
    :raises IOError: if unable to read an file in nexuslist
    :raises NexusFormatException: if a nexus file does not have a `data` block
    """
    def _source_label(reader, fallback):
        # prefer the file stem, then an explicit label, then the position
        if hasattr(reader, 'short_filename'):
            return os.path.splitext(reader.short_filename)[0]
        if hasattr(reader, 'label'):
            return reader.label
        return str(fallback)

    out = NexusWriter()
    charpos = 0
    for nex_id, nex in enumerate(nexuslist, 1):
        check_for_valid_NexusReader(nex, required_blocks=['data'])
        nexus_label = _source_label(nex, nex_id)

        first = charpos
        last = charpos + nex.data.nchar - 1
        out.add_comment("%d - %d: %s" % (first, last, nexus_label))

        sites = sorted(nex.data.characters)
        for site_idx, site in enumerate(sites):
            states = nex.data.characters.get(site)
            # work out character label
            charlabel = nex.data.charlabels.get(site_idx, site_idx + 1)
            label = '%s.%s' % (nexus_label, charlabel)
            for taxon, value in states.items():
                out.add(taxon, label, value)
        charpos += len(sites)
    return out
def setUp(self):
    """Create an empty NexusWriter and store it on ``self.nex``."""
    writer = NexusWriter()
    self.nex = writer
def create_nexus(bt_matrix, sample_list, absence_code=0, has_missing=False):
    '''
    create NexusWriter object from BedTool matrix

    The matrix is converted with ``str()`` and split into lines.  Each usable
    line has tab-separated fields: chromosome, start, end and a
    comma-separated list of samples present.  When `has_missing` is set the
    sixth field is read as well, as a comma-separated list of samples whose
    call is uncertain (the fifth field is not used).

    BedTool object apparently can contain empty lines, a simple length-check
    skips these (lines with fewer than four fields are reported and skipped)

    :param bt_matrix: BedTool (or anything whose ``str()`` is the matrix text)
    :param sample_list: iterable of sample (taxon) names
    :param absence_code: state recorded for samples that are neither present
        nor missing
    :param has_missing: indicate whether input BED file should be parsed for
        uncertainly called loci, that will be coded as "?"
    :return: the populated NexusWriter
    :raises IndexError: if `has_missing` is set and a line has fewer than
        six fields

    requires pybedtools, nexus
    '''
    from nexus import NexusWriter

    n = NexusWriter()
    matrix_lines = str(bt_matrix).split("\n")
    current_chrom = ""

    if has_missing:
        print(u"[ {} ] adding missing characters".format(modulename))

    for line in matrix_lines:
        fields = line.split("\t")
        if len(fields) < 4:  # check for incomplete line
            print(u"[ {} ] skipping incomplete or empty line: {}".format(
                modulename, ",".join(fields)))
            continue

        chrom, start, end = fields[:3]
        # sets give constant-time membership tests in the taxon loop
        samples_present = set(fields[3].split(","))
        if has_missing:
            samples_missing = set(fields[5].split(","))
        else:
            samples_missing = ()
        locus = "%s_%i_%i" % (chrom, int(start), int(end))  # define locus

        if current_chrom != chrom:
            print(u"[ {} ] Operating on {}".format(modulename, chrom))
            current_chrom = chrom

        # add locus to nexus object
        for taxon in sample_list:  # iterate over taxons
            if taxon in samples_present:
                char = 1
            elif taxon in samples_missing:
                char = "?"
            else:
                char = absence_code
            n.add(taxon, locus, char)
    return n
def setUp(self):
    """Fill ``self.nex`` with every (taxon, character, value) from ``data``."""
    self.nex = writer = NexusWriter()
    for char, states in data.items():
        for taxon, value in states.items():
            writer.add(taxon, char, value)
def setUp(self):
    """Fill ``self.nex`` with every (taxon, character, value) from ``data``."""
    self.nex = NexusWriter()
    for char in data:
        states = data[char]
        for taxon, value in states.items():
            self.nex.add(taxon, char, value)
class Test_NexusWriter(unittest.TestCase):
    """Exercise NexusWriter: adding and removing data, and generated output.

    All patterns are raw strings: the regex escapes they contain are not
    valid Python string escapes and trigger warnings in ordinary literals.
    """

    def setUp(self):
        # populate a fresh writer from the module-level ``data`` fixture
        self.nex = NexusWriter()
        for char in data:
            for taxon, value in data[char].items():
                self.nex.add(taxon, char, value)

    def test_char_adding1(self):
        """Test Character Addition 1"""
        assert self.nex.data['char1']['French'] == '1'
        assert self.nex.data['char1']['English'] == '2'
        assert self.nex.data['char1']['Latin'] == '3'

    def test_char_adding2(self):
        """Test Character Addition 2"""
        assert self.nex.data['char2']['French'] == '4'
        assert self.nex.data['char2']['English'] == '5'
        assert self.nex.data['char2']['Latin'] == '6'

    def test_char_adding_integer(self):
        """Test Character Addition as integer"""
        self.nex.add('French', 'char3', 9)
        self.nex.add('English', 'char3', '9')
        # the integer must come back as a string ...
        assert self.nex.data['char3']['French'] == '9'
        # ... and the value added as a string must be stored as given
        assert self.nex.data['char3']['English'] == '9'

    def test_characters(self):
        assert 'char1' in self.nex.characters
        assert 'char2' in self.nex.characters

    def test_taxa(self):
        assert 'French' in self.nex.taxa
        assert 'English' in self.nex.taxa
        assert 'Latin' in self.nex.taxa

    def test_remove(self):
        self.nex.remove("French", "char2")
        assert 'French' not in self.nex.data['char2']
        assert 'French' in self.nex.taxa

    def test_remove_character(self):
        self.nex.remove_character("char2")
        assert len(self.nex.characters) == 1
        assert 'char2' not in self.nex.data

    def test_remove_taxon(self):
        self.nex.remove_taxon("French")
        assert 'French' not in self.nex.taxa
        for char in self.nex.data:
            assert 'French' not in self.nex.data[char]
        n = self.nex.make_nexus(interleave=False)
        assert re.search(r"DIMENSIONS NTAX=2 NCHAR=2;", n)
        assert 'French' not in n

    def test_nexus_noninterleave(self):
        """Test Nexus Generation - Non-Interleaved"""
        n = self.nex.make_nexus(interleave=False)
        assert re.search(r"#NEXUS", n)
        assert re.search(r"BEGIN DATA;", n)
        assert re.search(r"DIMENSIONS NTAX=3 NCHAR=2;", n)
        assert re.search(r"MATRIX", n)
        assert re.search(r"Latin\s+36", n)
        assert re.search(r"French\s+14", n)
        assert re.search(r"English\s+25", n)
        assert re.search(r"FORMAT.*MISSING\=(.+?)", n).groups()[0] == '?'
        assert re.search(r"FORMAT.*DATATYPE\=(\w+)\s", n).groups()[0] \
            == 'STANDARD'
        assert re.search(r'FORMAT.*SYMBOLS\="(\d+)";', n).groups()[0] \
            == '123456'

    def test_nexus_charblock(self):
        """Test Nexus Generation - with characters block"""
        n = self.nex.make_nexus(charblock=True)
        assert re.search(r"#NEXUS", n)
        assert re.search(r"BEGIN DATA;", n)
        assert re.search(r"DIMENSIONS NTAX=3 NCHAR=2;", n)
        assert re.search(r"CHARSTATELABELS", n)
        assert re.search(r"1 char1,", n)
        assert re.search(r"2 char2", n)
        assert re.search(r"MATRIX", n)
        assert re.search(r"Latin\s+36", n)
        assert re.search(r"French\s+14", n)
        assert re.search(r"English\s+25", n)
        assert re.search(r"FORMAT.*MISSING\=(.+?)", n).groups()[0] == '?'
        assert re.search(r"FORMAT.*DATATYPE\=(\w+)\s", n).groups()[0] \
            == 'STANDARD'
        assert re.search(r'FORMAT.*SYMBOLS\="(\d+)";', n).groups()[0] \
            == '123456'

    def test_nexus_interleave(self):
        """Test Nexus Generation - Interleaved"""
        n = self.nex.make_nexus(interleave=True)
        assert re.search(r"#NEXUS", n)
        assert re.search(r"BEGIN DATA;", n)
        assert re.search(r"DIMENSIONS NTAX=3 NCHAR=2;", n)
        assert re.search(r"MATRIX", n)
        # char1
        assert re.search(r"Latin\s+3", n)
        assert re.search(r"French\s+1", n)
        assert re.search(r"English\s+2", n)
        # char2
        assert re.search(r"Latin\s+6", n)
        assert re.search(r"French\s+4", n)
        assert re.search(r"English\s+5", n)
        assert re.search(r"FORMAT.*MISSING\=(.+?)", n).groups()[0] == '?'
        assert re.search(r"FORMAT.*DATATYPE\=(\w+)\s", n).groups()[0] == \
            'STANDARD'
        assert re.search(r"FORMAT.*(INTERLEAVE)", n).groups()[0] == \
            'INTERLEAVE'
        assert re.search(r'FORMAT.*SYMBOLS\="(\d+)";', n).groups()[0] == \
            '123456'

    def test_polymorphic_characters(self):
        self.nex.add("French", "char1", 2)
        self.assertEqual(self.nex.data['char1']['French'], "12")
        n = self.nex.make_nexus(charblock=True)
        assert re.search(r"DIMENSIONS NTAX=3 NCHAR=2;", n)  # no change
        assert re.search(r"French\s+\(12\)4", n)

    def test_write_to_file(self):
        tmp = NamedTemporaryFile(delete=False, suffix=".nex")
        tmp.close()
        try:
            self.nex.write_to_file(tmp.name)
            assert os.path.isfile(tmp.name)

            with open(tmp.name, 'r') as handle:
                n = handle.read()

            assert re.search(r"#NEXUS", n)
            assert re.search(r"BEGIN DATA;", n)
            assert re.search(r"DIMENSIONS NTAX=3 NCHAR=2;", n)
            assert re.search(r"MATRIX", n)
            assert re.search(r"Latin\s+36", n)
            assert re.search(r"French\s+14", n)
            assert re.search(r"English\s+25", n)
            assert re.search(r"FORMAT.*MISSING\=(.+?)", n).groups()[0] == '?'
            assert re.search(r"FORMAT.*DATATYPE\=(\w+)\s", n).groups()[0] \
                == 'STANDARD'
            assert re.search(r'FORMAT.*SYMBOLS\="(\d+)";', n).groups()[0] \
                == '123456'
        finally:
            # cleanup, also when an assertion above fails
            os.unlink(tmp.name)

    def test_write_as_table(self):
        content = self.nex.write_as_table()
        assert re.search(r"Latin\s+36", content)
        assert re.search(r"French\s+14", content)
        assert re.search(r"English\s+25", content)
        assert len(content.split("\n")) == 3

    def test_write_as_table_with_polymorphoc(self):
        self.nex.add('French', 'char1', '2')
        content = self.nex.write_as_table()
        assert re.search(r"Latin\s+36", content)
        assert re.search(r"French\s+\(12\)4", content)
        assert re.search(r"English\s+25", content)
        assert len(content.split("\n")) == 3
def read_file(self):
    """Convert ``self.input_file`` according to the content of its first line.

    * First line ``#NEXUS``: the file is renamed to carry a ``.nex``
      extension and reading is delegated to ``NexusHandler``; that handler's
      ``read_file()`` result is returned.
    * First line ``xread``: the matrix rows are parsed, each taxon name is
      passed to ``verifyTaxa``, a Nexus file is written through
      ``NexusWriter`` and a ``VERIFIED_TAXA`` block is appended to it.  The
      path of the written file is returned.
    * Anything else: a message is printed and ``None`` is returned
      implicitly.

    Sets ``self.first_line``; on the xread path also ``self.ncols``,
    ``self.nrows`` and ``self.custom_block``.
    """
    with open( self.input_file, 'r') as f:
        self.first_line = f.readline()

        # TODO: Needs work xread / tnt file format is loose
        # xread has some other potential clues
        # NOTE(review): xread_filename is only referenced by the
        # commented-out code near the end of this method.
        xread_filename = f.readline().strip().replace("'", "")
        xread_matrix_dimensions = f.readline()
        lines = f.readlines()
        # NOTE(review): attribute access without a call, so this statement
        # does nothing; the ``with`` block is what closes the file.
        f.close

    if "#NEXUS" == self.first_line.strip():
        print "text file is nexus"

        # move to nexus folder
        # NOTE(review): file_extension is not used afterwards.
        filename, file_extension = os.path.splitext(self.input_file)
        os.rename(self.input_file, filename + ".nex")

        # send to correct matrix handler
        # TODO: A little dirty, probably should
        # change mediator's handler param so this flows cleaner
        matrixHandler = NexusHandler( filename + ".nex" )
        return matrixHandler.read_file()

    elif "xread" == self.first_line.strip():
        print "xread format found"

        # third line of the file: "<ncols> <nrows>", separated by one space
        dimensions = str(xread_matrix_dimensions).split(' ')
        self.ncols = int(dimensions[0])
        self.nrows = int(dimensions[1])

        custom_block = "\n\nBEGIN VERIFIED_TAXA;\n"
        # NOTE(review): nchar=4 presumably refers to the four values written
        # per verified taxon below - confirm.
        custom_block += "Dimensions ntax=" + str(self.nrows) + " nchar=4;\n"

        # NOTE(review): ``matrix`` is appended to below but never read in
        # this method.
        matrix = ""
        matrix_arr = []
        line_buffer = ""
        row_taxa = []

        for l in lines:
            if ";proc/;" != l.strip():

                # prepend whatever was buffered from earlier short lines
                if line_buffer:
                    line_row = line_buffer + " " + l.strip()
                else:
                    line_row = l.strip()

                # a row counts as complete once it is at least ncols long
                if len(line_row) >= self.ncols:

                    # reconstitute broken rows, then remove space/tabbing
                    line_parts = line_row.split(' ')
                    line_parts = list(filter(None, line_parts))

                    taxon_name = line_parts[0]
                    taxon_chars = line_parts[1]
                    #taxon_chars = line_parts[1].replace("[", "(")
                    #taxon_chars = taxon_chars.replace("]", ")")

                    # verify taxa
                    verified_taxa = verifyTaxa(taxon_name)
                    verified_name = None

                    if verified_taxa:
                        # every match contributes its own row
                        for taxa in verified_taxa:
                            # We split here to exclude the odd citation on the taxon name ( maybe regex what looks like name & name, year would be better )
                            verified_name = taxa['name_string'].lower().split(' ')
                            row_taxa.append( verified_name[0] )

                            custom_block += taxon_name + " " + taxa['name_string'] + " " + taxa['match_value'] + " " + taxa['datasource'] + "\n"
                            matrix += " " + verified_name[0] + " " + taxon_chars.strip() + "\n"
                            matrix_arr.append(taxon_chars.strip())
                    else:
                        # no match: keep the name exactly as found in the file
                        row_taxa.append( taxon_name )
                        custom_block += taxon_name + "\n"
                        matrix += " " + taxon_name + " " + taxon_chars.strip() + "\n"
                        matrix_arr.append(taxon_chars.strip())

                    line_buffer = ""
                else:
                    # row is still too short: keep it for the next line
                    line_buffer += l.strip()

        custom_block += ";\n"
        custom_block += "END;\n"
        self.custom_block = custom_block

        print "matrix array"
        # split every row string into a list of single characters
        marr = []
        for row in matrix_arr:
            items = list(row)
            marr.append(items)

        m = numpy.matrix(marr)

        nw = NexusWriter()
        nw.add_comment("Morphobank generated Nexus from xread .txt file ")

        for rx in range(self.nrows):
            taxon_name = row_taxa[rx]
            # NOTE(review): item(rx) is given a single flat index; confirm it
            # yields the whole character sequence of row rx rather than one
            # cell.
            cell_value = m.item(rx)
            for cindex, cv in enumerate(cell_value):
                char_no = cindex + 1  # character numbers start at 1
                nw.add(taxon_name, char_no, cv)

        # keep the upload path and filename, but change extension
        # NOTE(review): this keeps only the text before the FIRST dot of the
        # whole path, so a dot in a directory name shortens the result -
        # confirm against the expected upload paths.
        file_parts = self.input_file.split('.')
        output_file = file_parts[0] + '.nex'

        nw.write_to_file(filename=output_file, interleave=False, charblock=True)

        # move to nexus folder
        #os.rename(xread_filename + ".nex", "./nexus/" + xread_filename + ".nex")

        # wait for file to move before open and append
        #while not os.path.exists('./nexus/' + xread_filename + '.nex'):
        #  time.sleep(1)

        #if os.path.isfile('./nexus/' + xread_filename + '.nex'):

        # Custom Block Section
        nexus_file = codecs.open(output_file, 'a', 'utf-8')
        nexus_file.write(custom_block)
        nexus_file.close()

        return output_file

    else:
        print "do not know how to process this .txt file"
def read_file(self):
    """Convert the first sheet of the workbook ``self.input_file`` to Nexus.

    Row 0 and column 0 are skipped as data: column 0 of every other row
    holds the taxon name, the remaining cells hold character states.  Each
    taxon name is passed to ``verifyTaxa``; when matches come back, the
    lower-cased ``name_string`` of the last match labels the row, otherwise
    the name from the sheet is used.  Numeric cells are stored as integers;
    in other cells curly braces are replaced by parentheses.

    The Nexus file is written next to the input with a ``.nex`` extension
    and a ``VERIFIED_TAXA`` block is appended to it.

    Sets ``self.nrows``, ``self.ncols`` and ``self.custom_block``.

    :return: path of the written Nexus file
    """
    import os

    book = xlrd.open_workbook(self.input_file)
    sh = book.sheet_by_index(0)
    nw = NexusWriter()

    self.nrows = sh.nrows
    self.ncols = sh.ncols

    nw.add_comment("Morphobank generated Nexus File")

    custom_block = "\n\nBEGIN VERIFIED_TAXA;\n"
    custom_block += "Dimensions ntax=" + str(self.nrows) + " nchar=4;\n"

    # Species List Taxa (row 0 is skipped)
    for rx in range(1, self.nrows):
        taxon_name = str(sh.cell_value(rowx=rx, colx=0)).strip()
        verified_taxa = verifyTaxa(taxon_name)
        verified_name = None

        if verified_taxa:
            for taxa in verified_taxa:
                verified_name = taxa['name_string'].lower()
                custom_block += taxon_name + " " + taxa['name_string'] + " " + taxa['match_value'] + " " + taxa['datasource'] + "\n"
        else:
            custom_block += taxon_name + "\n"

        row_label = verified_name if verified_name else taxon_name

        # column 0 is the taxon name, so characters start at column 1
        for cx in range(1, self.ncols):
            cell_value = sh.cell_value(rowx=rx, colx=cx)
            if is_number(cell_value):
                cell_value = int(cell_value)
            else:
                cell_value = cell_value.replace("{", "(").replace("}", ")")
            nw.add(row_label, cx, cell_value)

    custom_block += ";\n"
    custom_block += "END;\n"
    self.custom_block = custom_block

    # keep the upload path and filename, but change extension; splitext only
    # strips the final extension, so dots elsewhere in the path are safe
    output_file = os.path.splitext(self.input_file)[0] + '.nex'

    nw.write_to_file(filename=output_file, interleave=False, charblock=True)

    # quick append Custom Block; the handle is closed even if write() fails
    with open(output_file, 'a') as nexus_file:
        nexus_file.write(custom_block)

    return output_file