def test_remove_common_words(self):
    """ Test removing common words """
    d = "This is the sentence of words to process"
    words = NRDatabaseFilter.remove_common_words(d.split())
    self.assertTrue("the" not in words)
    self.assertTrue("of" not in words)
    self.assertTrue("sentence" in words)
    self.assertTrue("This" in words)
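# NOTE: hypothetical reference sketch, not the actual NRDatabaseFilter code.
# The test above only requires that remove_common_words() drops frequent
# English stopwords (e.g. "the", "of") while keeping informative words such
# as "sentence" and "This". A minimal helper consistent with that
# expectation could look like this (stopword list is an assumption):
def _remove_common_words_sketch(words,
                                stopwords=frozenset(["a", "an", "the",
                                                     "of", "is", "to"])):
    """Return the words that are not in the stopword set (case-insensitive)."""
    return [w for w in words if w.lower() not in stopwords]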
def test_filtering(self):
    """ Test the filtering of the NR database """
    f = open(os.path.join(self.datadir, "coglist13962_10-oct-2012.txt"),
             "rU")  # "rU": universal newline mode
    reader = csv.reader(f, delimiter="\t")
    next(reader)  # discard the title line
    cogs_ids = []
    cogs_names = []
    i = 0
    for row in reader:
        cogs_ids.append(row[0])
        cogs_names.append(row[1].lower())
        i += 1
        if i > 5:
            break
    f.close()
    log.debug("Filtering COGS %s", cogs_ids)
    log.debug("Descriptions: %s", cogs_names)
    fn_database = os.path.join(self.datadir, "mini_nr", "nr_test2")
    dbfilter = NRDatabaseFilter.NRDatabaseFilter(fn_database)
    dbfilter.set_ids(cogs_ids)
    dbfilter.set_descriptions(cogs_names)
    dbfilter.do_filtering(overwrite=True)
    for cog_id, name in zip(cogs_ids, cogs_names):
        fn = dbfilter.get_database_name(cog_id)
        if os.path.exists(fn):
            # Check that the file contains only sequences that
            # match the description
            keywords = name.lower().split()
            for seq_record in SeqIO.parse(fn, "fasta"):
                words = seq_record.description.lower().split(" ")
                self.assertTrue(
                    NRDatabaseFilter.contains_all_keywords(words, keywords),
                    "There is a sequence in file {0} that does not have "
                    "the right keywords {1}".format(fn, str(keywords)))
    # Clean up the filtered database files created during the test
    for fn_created in dbfilter.get_database_files_created():
        os.remove(fn_created)
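# NOTE: hypothetical sketch of the filtering step exercised above, not the
# actual NRDatabaseFilter.do_filtering() code. The test expects that, for
# each COG description, the per-COG output FASTA only contains NR records
# whose description carries every keyword of that description. It assumes
# "from Bio import SeqIO", as already used elsewhere in this test module.
def _filter_fasta_by_keywords_sketch(fn_fasta_in, fn_fasta_out, keywords):
    """Write to fn_fasta_out the records of fn_fasta_in whose description
    contains every keyword (case-insensitive)."""
    selected = []
    for record in SeqIO.parse(fn_fasta_in, "fasta"):
        words = record.description.lower().split()
        if all(k in words for k in keywords):
            selected.append(record)
    SeqIO.write(selected, fn_fasta_out, "fasta")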
def test_contains_all_keywords(self):
    """ Test contains_all_keywords """
    words = "tRNA amylase something protein".split()
    keywords = "amylase protein".split()
    self.assertTrue(NRDatabaseFilter.contains_all_keywords(words, keywords))
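# NOTE: hypothetical reference sketch, not the actual NRDatabaseFilter code.
# The test above only requires that contains_all_keywords(words, keywords)
# is True when every keyword appears in the list of words:
def _contains_all_keywords_sketch(words, keywords):
    """Return True if every keyword is present in words."""
    return all(k in words for k in keywords)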
help="File containing COGS and their annotations as provided by IMG/M") parser.add_argument("fn_nr", help="File of the NR NCBI database") parser.add_argument("--log", dest="log", default = False, help="Log file") args = parser.parse_args() if(args.log): logging.basicConfig(filename=args.log, filemode="w") else: logging.basicConfig(stream=sys.stdout) logging.root.setLevel(logging.INFO) f = open(args.fn_cogs,"rU") reader = csv.reader(f, delimiter="\t") reader.next() # ignore title line i = 0 cogs_names = [] cogs_ids = [] for row in reader: cogs_ids.append(row[0]) cogs_names.append(row[1].lower()) f.close() nr_filter = NRDatabaseFilter.NRDatabaseFilter(args.fn_nr) log.info("Cogs names %s",cogs_names) log.info("Cogs ids %s",cogs_ids) nr_filter.set_descriptions(cogs_names) nr_filter.set_ids(cogs_ids) nr_filter.do_filtering()