def run_tarsqi(args):
    """Main method called when the script is executed from the command line.

    Creates a Tarsqi instance and lets it process the input. If the input
    is a directory, iterates over its contents, setting up a Tarsqi
    instance for each file. The arguments are the list of arguments given
    by the user on the command line. There is no return value.

    Raises:
        TarsqiError: when input/output arguments are missing or invalid,
            or when the output file already exists.
    """
    (opts, args) = _read_arguments(args)
    if len(args) < 2:
        raise TarsqiError("missing input or output arguments\n%s"
                          % _usage_string())
    # Use os.path.abspath here because some components change the working
    # directory and when some component fails the cwd may not be reset to
    # the root directory.
    inpath = os.path.abspath(args[0])
    outpath = os.path.abspath(args[1])
    t0 = time.time()
    if os.path.isdir(inpath) and os.path.isdir(outpath):
        for file in os.listdir(inpath):
            infile = inpath + os.sep + file
            outfile = outpath + os.sep + file
            if os.path.isfile(infile):
                # Parenthesized call form works on both Python 2 and 3;
                # the original "print infile" is Python-2-only syntax.
                print(infile)
                Tarsqi(opts, infile, outfile).process()
    elif os.path.isfile(inpath):
        if os.path.exists(outpath):
            raise TarsqiError('output file ' + outpath + ' already exists')
        Tarsqi(opts, inpath, outpath).process()
    else:
        raise TarsqiError('Invalid input and/or output options')
    logger.info("TOTAL PROCESSING TIME: %.3f seconds" % (time.time() - t0))
def run_tarsqi(args):
    """Main method called when the script is executed.

    Creates a TarsqiControl instance and lets it process the input. If the
    input is a directory, iterates over its contents, setting up a
    TarsqiControl instance for each file. The arguments are the list given
    by the user on the command line. No return value; exits via sys.exit()
    on an existing output file or invalid input/output combinations.
    """
    # Renamed locals: the originals shadowed the builtins input and file.
    (input_type, opts, inpath, outpath) = read_arguments(args)
    begin_time = time.time()
    if os.path.isdir(inpath) and os.path.isdir(outpath):
        for name in os.listdir(inpath):
            infile = inpath + os.sep + name
            outfile = outpath + os.sep + name
            if os.path.isfile(infile):
                # print() works on both Python 2 and 3.
                print(infile)
                TarsqiControl(input_type, opts, infile, outfile).process()
    elif os.path.isfile(inpath):
        if os.path.exists(outpath):
            sys.exit('ERROR: output file ' + outpath + ' already exists')
        TarsqiControl(input_type, opts, inpath, outpath).process()
    else:
        sys.exit('Invalid input and/or output parameters')
    end_time = time.time()
    logger.info("TOTAL PROCESSING TIME: %.3f seconds" % (end_time - begin_time))
def get_stock_data(self, stock_name):
    """Return a DataFrame of adjusted prices for *stock_name*.

    Loads the Quandl WIKI/PRICES table, drops the unadjusted columns,
    indexes by date (ascending), renames the adjusted columns to the
    legacy Open/High/Low/Volume/price headers and drops NaN rows.
    """
    logger.info("Loading Stock [%s]...", stock_name)
    df = quandl.get_table('WIKI/PRICES', ticker=stock_name, paginate=True)
    # Keep only the adjusted series; use the axis keyword instead of the
    # deprecated positional axis argument.
    df.drop(['ticker', 'open', 'high', 'low', 'close', 'ex-dividend',
             'volume', 'split_ratio'], axis=1, inplace=True)
    df.set_index('date', inplace=True)
    # Renaming all the columns so that we can use the old version code.
    df.rename(columns={'adj_open': 'Open',
                       'adj_high': 'High',
                       'adj_low': 'Low',
                       'adj_volume': 'Volume',
                       'adj_close': HeaderFactory.Price}, inplace=True)
    df.sort_index(ascending=True, inplace=True)
    df.dropna(inplace=True)
    return df
def run_tarsqi(args):
    """Main method called when the script is executed.

    Creates a TarsqiControl instance and lets it process the input. If the
    input is a directory, iterates over its contents, setting up a
    TarsqiControl instance for each file. The arguments are the list given
    by the user on the command line. No return value.

    Note: an existing output file is silently overwritten — the overwrite
    guard present in other variants was deliberately disabled here.
    """
    # Renamed locals: the originals shadowed the builtins input and file.
    (input_type, opts, inpath, outpath) = read_arguments(args)
    begin_time = time.time()
    if os.path.isdir(inpath) and os.path.isdir(outpath):
        for name in os.listdir(inpath):
            infile = inpath + os.sep + name
            outfile = outpath + os.sep + name
            if os.path.isfile(infile):
                # print() works on both Python 2 and 3.
                print(infile)
                TarsqiControl(input_type, opts, infile, outfile).process()
    elif os.path.isfile(inpath):
        TarsqiControl(input_type, opts, inpath, outpath).process()
    else:
        sys.exit('Invalid input and/or output parameters')
    end_time = time.time()
    logger.info("TOTAL PROCESSING TIME: %.3f seconds" % (end_time - begin_time))
def process_string(self, input_string):
    """Variant of process() that works on a string instead of a file.

    Nothing is written to disk; the resulting TarsqiDocument is
    returned."""
    logger.info(input_string)
    parsed = self.source_parser.parse_string(input_string)
    self.document = parsed
    self._process_document()
    return self.document
def __iter__(self):
    """Walk self.data_path and yield a (vector, filename) pair for every
    file found, with vectors produced by self.source."""
    logger.info("Loading %s...", self.data_path)
    for root, _dir_names, file_names in walk(self.data_path):
        for entry in file_names:
            full_path = path.join(root, entry)
            yield self.source.get_vector(full_path), entry
def _log_duration(duration, file_name, length):
    """Log how long preprocessing of *file_name* took, plus throughput."""
    rate = length / duration
    logger.info(
        'Preprocessed {} of length {} in {} seconds '
        '({} char per second)'.format(file_name, length, duration, rate))
def apply_component(self, name, wrapper, infile, outfile):
    """Apply a component if the processing parameters call for it.

    Passes the content tag and the xml_document to the component's
    wrapper and asks the wrapper to process the document fragments.
    Component-level errors are trapped here if trap_errors is True, in
    which case the input file is copied to the output file so downstream
    components still have a file to work with.

    Arguments:
        name - string, the name of the component
        wrapper - instance of a subclass of ComponentWrapper
        infile - string
        outfile - string
    Return value: None
    """
    # NOTES
    # - Components still write results to file, which is not conform to
    #   the specs, but writing files is a minor part of processing time.
    # - The wrappers use the xml document and the content tag: they
    #   (i) create fragments from the xml doc, (ii) process the
    #   fragments, (iii) reinsert them, and (iv) write the xml doc to a
    #   file; that file is not opened by the next wrapper.
    # - Errors are trapped here instead of in the component since we do
    #   not tell the component what the output file is.

    def call_wrapper(wrapper, content_tag, xmldoc, trap_errors, outfile):
        # Run the wrapper on the fragments and persist the result.
        wrapper(content_tag, xmldoc, self).process()
        self.xml_document.save_to_file(outfile)

    logger.info("RUNNING " + name + " on: " + infile)
    trap_errors = self.getopt_trap_errors()
    if trap_errors:
        try:
            call_wrapper(wrapper, self.content_tag, self.xml_document,
                         trap_errors, outfile)
        except Exception:
            # sys.exc_type / sys.exc_value were removed in Python 3;
            # sys.exc_info() is portable and yields the same objects.
            etype, evalue = sys.exc_info()[:2]
            logger.error(name + " error on " + infile + "\n\t"
                         + str(etype) + "\n\t" + str(evalue) + "\n")
            # Pass the input through so the pipeline can continue.
            shutil.copy(infile, outfile)
    else:
        call_wrapper(wrapper, self.content_tag, self.xml_document,
                     trap_errors, outfile)
def count_occurences(y):
    """Log how many records of each type occur in *y*.

    Handles both a one-hot/multi-column matrix (counts per column) and a
    flat label vector (counts per unique value)."""
    multi_column = len(y.shape) > 1 and y.shape[1] > 1
    if multi_column:
        for col in range(y.shape[1]):
            logger.info("Found %s type with %i records", col, sum(y[:, col]))
    else:
        for label in np.unique(y):
            logger.info("Found %s type with %i records",
                        label, sum(y == label))
def process_string(self, input_string):
    """Like process(), but runs on an input string rather than a file.

    Does not write output to a file; returns the TarsqiDocument."""
    preview = input_string[:75].replace('\n', ' ')
    logger.info(preview)
    self.source_parser.parse_string(input_string, self.tarsqidoc)
    self.metadata_parser.parse(self.tarsqidoc)
    self.docstructure_parser.parse(self.tarsqidoc)
    for component_name, component_wrapper in self.pipeline:
        self._apply_component(component_name, component_wrapper,
                              self.tarsqidoc)
    return self.tarsqidoc
def process(self):
    """Parse the source with the source parser, apply all components and
    write the results to a file.

    Processing is driven by the options set at initialization; components
    are given the TarsqiDocument and update it. Skipped files are left
    untouched."""
    if self._skip_file():
        return
    self._cleanup_directories()
    logger.info(self.input)
    self.document = self.source_parser.parse_file(self.input)
    self._process_document()
    self._write_output()
def get_stock_data(self, stock_name):
    """Load the Reddit DJIA daily table from disk.

    Returns a DataFrame indexed by date (ascending) with the unadjusted
    close dropped, 'Adj Close' renamed to the price header, and NaN rows
    removed.

    Note: *stock_name* is accepted for interface compatibility with the
    other loaders but is not used here.
    """
    file_path = path.join(Constants.DATASETS_MARKET, 'reddit/DJIA_table.csv')
    logger.info("Loading [%s]...", file_path)
    market_data = pd.read_csv(file_path, na_values=['nan'])
    market_data.Date = pd.to_datetime(market_data.Date, format='%Y-%m-%d')
    market_data.set_index('Date', inplace=True)
    # The original called market_data.reindex() and discarded the result;
    # reindex is not in-place, so that call was a no-op and was removed.
    market_data.sort_index(ascending=True, inplace=True)
    # Drop the unadjusted close and promote the adjusted one.
    market_data.drop(labels=[HeaderFactory.Price], axis=1, inplace=True)
    market_data.rename(columns={"Adj Close": HeaderFactory.Price},
                       inplace=True)
    market_data.dropna(inplace=True)
    return market_data
def feed_subscription_url_from_xml(fname):
    """Insert index URLs from a workbook as Subscription rows.

    Reads column 1 of the first sheet of *fname* (skipping the header
    row) and calls Subscription.get_or_create for each value. Logs a
    warning and returns early when the file does not exist. No return
    value.
    """
    if not os.path.exists(fname):
        # warning() is the non-deprecated spelling of warn(); lazy %-args
        # defer formatting until the record is actually emitted.
        logger.warning('%s does not exist', fname)
        return
    counter = 0
    with open_workbook(fname) as wb:
        sheet = wb.sheet_by_index(0)
        # Row 0 is the header row.
        for row in range(1, sheet.nrows):
            Subscription.get_or_create(index_url=sheet.cell(row, 1).value)
            counter += 1
    logger.info('%s index_url(s) has been inserted', counter)
def process_document(self):
    """Parse the source (source parser, metadata parser, document
    structure parser), run every pipeline component on the TarsqiDocument
    and write the results to a file.

    Processing is driven by the options set at initialization; components
    update the TarsqiDocument in place."""
    self._cleanup_directories()
    logger.info(self.input)
    logger.info("Source type is '%s'" % self.options.source)
    self.source_parser.parse_file(self.input, self.tarsqidoc)
    self.metadata_parser.parse(self.tarsqidoc)
    self.docstructure_parser.parse(self.tarsqidoc)
    for component_name, component_wrapper in self.pipeline:
        self._apply_component(component_name, component_wrapper,
                              self.tarsqidoc)
    self._write_output()
def get_stock_data(self, stock_name: str):
    """Load <stock_name>.csv from the market datasets directory.

    Returns a DataFrame indexed by date (ascending) with Bloomberg PX_*
    columns renamed to the legacy Open/High/Low/Volume/price headers.
    Currency instruments ('curncy' in the name) have their volume column
    dropped first. NaN rows are deliberately retained.
    """
    file_path = path.join(Constants.DATASETS_MARKET,
                          'stock/{}.csv'.format(stock_name))
    logger.info("Loading [%s]...", file_path)
    market_data = pd.read_csv(file_path, na_values=['nan'])
    market_data.Date = pd.to_datetime(market_data.Date, format='%Y-%m-%d')
    market_data.set_index('Date', inplace=True)
    # The original called market_data.reindex() and discarded the result;
    # reindex is not in-place, so that call was a no-op and was removed.
    market_data.sort_index(ascending=True, inplace=True)
    if 'curncy' in stock_name.lower():
        # Currencies carry no meaningful volume figures.
        market_data.drop(labels=["PX_VOLUME"], axis=1, inplace=True)
    market_data.rename(columns={'PX_OPEN': 'Open',
                                'PX_HIGH': 'High',
                                'PX_LOW': 'Low',
                                'PX_VOLUME': 'Volume',
                                'PX_LAST': HeaderFactory.Price},
                       inplace=True)
    # NOTE(review): dropna() was deliberately disabled here (unlike the
    # other loaders) — confirm that NaN rows are wanted downstream.
    return market_data
def measure_performance_auc(test_y, result_y, result_y_prob):
    """Compute and log accuracy and, for binary targets, ROC AUC.

    Returns:
        (accuracy, auc) where auc is None for non-binary targets, or
        None when metric computation fails (the error is logged).
    """
    try:
        vacc = metrics.accuracy_score(test_y, result_y)
        if len(np.unique(test_y)) == 2:
            # AUC is only defined for binary targets.
            vauc = roc_auc_score(test_y, result_y_prob)
            # Fixed "Accurary" typo from the original log messages.
            logger.info('Accuracy: {0:.3f} and AUC {1:.3f}'.format(
                vacc, vauc))
        else:
            vauc = None
            logger.info('Accuracy: {0:.3f}'.format(vacc))
        return vacc, vauc
    except Exception:
        # logger.exception records the traceback; the original bare
        # except + error() hid the actual cause.
        logger.exception("Error calculating metrics")
def process_training(self):
    """Training variant of process(), called from the TarsqiControl class.

    Fragments are created and processed with
    process_training_fragments() instead of process_fragments(), then
    retrieved."""
    self.create_fragments(self.tag, 'fragment')
    started = time.time()
    self.process_training_fragments()
    elapsed = time.time() - started
    logger.info("%s DONE, TRAINING processing time was %.3f seconds"
                % (self.component_name, elapsed))
    self.retrieve_fragments('fragment')
def __iter__(self):
    """Yield (type_class, review_id, vector) triples for each supported
    row of the tab-separated review file; rows whose text yields no
    vector are logged and skipped."""
    with io.open(self.file_name, 'rt', encoding='utf8') as csv_file:
        logger.info('Loading: %s', self.file_name)
        for line in csv_file:
            row = re.split(r'\t+', line)
            if len(row) < 3:
                continue
            review_id = row[0]
            # Second-to-last column holds the class, last holds the text.
            type_class = self.convertor.is_supported(row[-2])
            if type_class is None:
                continue
            text = row[-1]
            vector = self.source.get_vector_from_review(text)
            if vector is None:
                logger.warn("Vector not found: %s", text)
            else:
                yield type_class, review_id, vector
def process(self):
    """Create, process and retrieve fragments; called from the
    TarsqiControl class.

    The fragment-processing step (process_fragments) is defined by each
    concrete wrapper. No arguments and no return value."""
    self.create_fragments(self.tag, 'fragment')
    started = time.time()
    self.process_fragments()
    elapsed = time.time() - started
    logger.info("%s DONE, processing time was %.3f seconds"
                % (self.component_name, elapsed))
    self.retrieve_fragments('fragment')
def run(self):
    """Main method called when the script is executed from the command
    line.

    Dispatches on inpath/outpath: no paths means pipe mode, a directory
    means one Tarsqi instance per contained file, a file means a single
    instance. Logs total processing time and the logger report."""
    started = time.time()
    if self.inpath is None and self.outpath is None:
        self._run_tarsqi_on_pipe()
    elif os.path.isdir(self.inpath):
        self._run_tarsqi_on_directory()
    elif os.path.isfile(self.inpath):
        self._run_tarsqi_on_file()
    else:
        raise TarsqiError('Invalid input')
    logger.info("TOTAL PROCESSING TIME: %.3f seconds"
                % (time.time() - started))
    logger.report()
def process(args):
    """Run the processing step over the preprocessed directory and log
    throughput (files per second)."""
    start = time.time()
    alpha_files = path.iter_files_in(args.preprocessed_dir)
    beta_files = path.iter_files_in(args.preprocessed_dir)
    runner = process_parallel if args.parallel else process_serial
    cnt = runner(args, alpha_files, beta_files)
    duration = time.time() - start
    # Guard against a zero-length interval on very fast runs.
    if not duration:
        duration = 1
    logger.info('Processed {} files per second'.format(cnt / duration))
def _apply_component(self, name, wrapper, tarsqidocument):
    """Apply a component to the TarsqiDocument via its wrapper.

    The TarsqiDocument includes the options from the Tarsqi instance.
    Component-level errors are trapped here when --trap-errors is True;
    even then, partial results may already have been written to the
    TagRepositories in the TarsqiDocument. Logs the component's running
    time. No return value.
    """
    logger.info(name + '............')
    t1 = time.time()
    if self.options.trap_errors:
        try:
            wrapper(tarsqidocument).process()
        except Exception:
            # sys.exc_type / sys.exc_value do not exist in Python 3;
            # sys.exc_info() is portable and yields the same objects.
            etype, evalue = sys.exc_info()[:2]
            logger.error("%s error:\n\t%s\n\t%s\n" % (name, etype, evalue))
    else:
        wrapper(tarsqidocument).process()
    logger.info("%s DONE (%.3f seconds)" % (name, time.time() - t1))
def process(args):
    """Run the processing step, timing it and logging files/second."""
    begin = time.time()
    alpha = path.iter_files_in(args.preprocessed_dir)
    beta = path.iter_files_in(args.preprocessed_dir)
    if args.parallel:
        count = process_parallel(args, alpha, beta)
    else:
        count = process_serial(args, alpha, beta)
    took = time.time() - begin
    if took == 0:
        # Avoid division by zero for sub-resolution runs.
        took = 1
    logger.info('Processed {} files per second'.format(count / took))
def run(self):
    """Main method called when the script is executed from the command
    line.

    Selects pipe, directory or single-file processing from
    inpath/outpath, runs it, then logs total time and writes the logger
    report to stderr."""
    t_start = time.time()
    if self.inpath is None and self.outpath is None:
        handler = self._run_tarsqi_on_pipe
    elif os.path.isdir(self.inpath):
        handler = self._run_tarsqi_on_directory
    elif os.path.isfile(self.inpath):
        handler = self._run_tarsqi_on_file
    else:
        raise TarsqiError('Invalid input')
    handler()
    logger.info("TOTAL PROCESSING TIME: %.3f seconds"
                % (time.time() - t_start))
    logger.report(sys.stderr)
def save(self, filename=None):
    """Save an unfinished game to disk as a gzipped pickle.

    Arguments:
        filename: optional name for the save file; falls back to the
            game's current filename, and finally to a generated name.
    """
    if not os.path.exists(full_saved_games_dir):
        os.makedirs(full_saved_games_dir)
    # Resolve the effective name once: explicit argument, then the name
    # this game was previously saved under, then a fresh generated name.
    self.filename = (filename if filename
                     else self.filename if self.filename
                     else generate_file_name())
    logger.info("Saving game to file \"{}\"...".format(self.filename))
    game_data = {
        "board_type": self.board_type,
        "board": self.board,
        "name": self.filename
    }
    # BUG FIX: the original opened "{}.p".format(filename) using the raw
    # parameter, so calling save() with no argument wrote to "None.p";
    # use the resolved self.filename instead.
    with gzip.open(
            os.path.join(full_saved_games_dir,
                         "{}.p".format(self.filename)), "wb") as f:
        f.write(pickle.dumps(game_data))
    logger.info("Game saved.")
def process_fragments(self):
    """Set fragment names, create the vectors for each fragment, run the
    classifier and add links from the classifier to the fragments."""
    # The classifier scripts and binaries must run from their own
    # directory; the cwd is restored to TTK_ROOT at the end.
    os.chdir(self.DIR_CLASSIFIER)
    perl = self.tarsqi_instance.getopt_perl()
    for fragment in self.fragments:
        # print() works on both Python 2 and 3; the original used the
        # Python-2-only statement form.
        print(fragment)
        # set fragment names
        base = fragment[0]
        fin = os.path.join(self.DIR_DATA,
                           base + '.' + self.CREATION_EXTENSION)
        ftmp = os.path.join(self.DIR_DATA,
                            base + '.' + self.TMP_EXTENSION)
        fout = os.path.join(self.DIR_DATA,
                            base + '.' + self.RETRIEVAL_EXTENSION)
        # process them
        #self._create_vectors(in, in+'.EE2', in+'.ET2', fragment)
        fin_ee = fin + '.EE'
        fin_et = fin + '.ET'
        ee_model = 'data/op.e-e.model'
        et_model = 'data/op.e-t.model'
        # Prepare vectors with perl, run the classifier once for
        # event-event and once for event-time pairs, then collect.
        commands = [
            "%s prepareClassifier.pl %s %s %s" % (perl, fin, fin_ee, fin_et),
            "./mxtest.opt -input %s -model %s -output %s.REL >> class.log"
            % (fin_ee, ee_model, fin_ee),
            "./mxtest.opt -input %s -model %s -output %s.REL >> class.log"
            % (fin_et, et_model, fin_et),
            "%s collectClassifier.pl %s %s %s" % (perl, fin_ee, fin_et, ftmp)]
        for command in commands:
            logger.info(command)
            os.system(command)
        self._add_tlinks_to_fragment(fin, ftmp, fout)
    os.chdir(TTK_ROOT)
def create_tables(tables, reset=False):
    """Create every table in *tables* that does not yet exist.

    With reset=True, existing tables are dropped (cascade) and recreated;
    otherwise existing tables are left alone and a message is logged. No
    return value.
    """
    for table in tables:
        if not table.table_exists():
            table.create_table()
            # Lazy %-args defer formatting until the record is emitted.
            logger.info('Table %s created', table.__name__)
        elif reset:
            table.drop_table(cascade=True)
            logger.info('Existing table %s dropped', table.__name__)
            table.create_table()
            logger.info('Table %s created', table.__name__)
        else:
            logger.info('Table %s already exists', table.__name__)
def process_item(self, item, spider):
    """Scrapy pipeline hook: persist the article URL and its relation to
    the subscription when the spider has db_write enabled.

    Raises:
        RuntimeError: when the database interaction fails (the original
            error is logged first).
    """
    if not getattr(spider, 'db_write', None):
        return
    try:
        article, url_inserted = Article.get_or_create(
            article_url=item['article_url'])
        # Both branches of the original issued an identical
        # SubscriptionArticle.get_or_create call; hoisted so it runs once.
        subs_article, relation_created = SubscriptionArticle.get_or_create(
            subscription=item['subscription_id'], article=article.id)
        if url_inserted:
            logger.info(
                'article_url [ID:%s] is now associated with index_url [ID:%s]',
                subs_article.article, subs_article.subscription)
        elif not relation_created:
            logger.info(
                'relation between article_url [ID:%s] and index_url [ID:%s] has been ignored',
                subs_article.article, subs_article.subscription)
        else:
            logger.info(
                'article_url [ID:%s] has created a new relationship with index_url [ID:%s]',
                subs_article.article, subs_article.subscription)
    except (RuntimeError, KeyError, NameError) as e:
        logger.error('%s happened when handling %s',
                     str(e), item['article_url'])
        raise RuntimeError('Error received from Scrapy Pipelines')
def __iter__(self):
    """Walk self.source and yield sentences extracted from each text file.

    Skips .bin/.npy artifacts, tracks counts in self.total (sentences)
    and self.totalFiles (files), and logs progress every 10000 files.
    Files that fail to process are logged and skipped.
    """
    self.total = 0
    for root, subdirs, files in os.walk(self.source):
        for fname in files:
            full_path = os.path.join(root, fname)
            if '.bin' in full_path or '.npy' in full_path:
                logger.debug('Ignore %s', fname)
                continue
            self.totalFiles += 1
            if self.totalFiles % 10000 == 0:
                logger.info('Processed %i files and %i sentences',
                            self.totalFiles, self.total)
            with open(full_path, encoding='utf8') as file:
                try:
                    text = file.read()
                    text = text.replace('\n', '')
                    result = self.lexicon.review_to_sentences(
                        utils.to_unicode(text))
                    for sentence in result:
                        self.total += 1
                        yield sentence
                except Exception:
                    # The original bare except would also swallow
                    # KeyboardInterrupt and GeneratorExit (thrown into a
                    # generator when the consumer stops); catch only
                    # Exception.
                    logger.error("failed processing file: %s", fname)
    logger.info('Processed %i', self.total)
def process_fragments(self):
    """Set fragment names, create the vectors for each fragment, run the
    classifier and add links from the classifier to the fragments."""
    # The classifier scripts and binaries are run from their own
    # directory; the cwd is restored to TTK_ROOT at the end.
    os.chdir(self.DIR_CLASSIFIER)
    perl = self.tarsqi_instance.getopt_perl()
    for fragment in self.fragments:
        # set fragment names
        base = fragment[0]
        fin = os.path.join(self.DIR_DATA, base+'.'+self.CREATION_EXTENSION)
        ftmp = os.path.join(self.DIR_DATA, base+'.'+self.TMP_EXTENSION)
        fout = os.path.join(self.DIR_DATA, base+'.'+self.RETRIEVAL_EXTENSION)
        # process them
        #self._create_vectors(in, in+'.EE2', in+'.ET2', fragment)
        fin_ee = fin + '.EE'  # event-event vector file
        fin_et = fin + '.ET'  # event-time vector file
        ee_model = 'data/op.e-e.model'
        et_model = 'data/op.e-t.model'
        # Prepare vectors with perl, run mxtest.opt once per vector type
        # (appending its output to class.log), then collect the results
        # into ftmp; each command is logged before it is run.
        commands = [
            "%s prepareClassifier.pl %s %s %s" % (perl, fin, fin_ee, fin_et),
            "./mxtest.opt -input %s -model %s -output %s.REL >> class.log" % (fin_ee, ee_model, fin_ee),
            "./mxtest.opt -input %s -model %s -output %s.REL >> class.log" % (fin_et, et_model, fin_et),
            "%s collectClassifier.pl %s %s %s" % (perl, fin_ee, fin_et, ftmp)]
        for command in commands:
            logger.info(command)
            os.system(command)
        # Merge the classifier output (ftmp) back into the fragment.
        self._add_tlinks_to_fragment(fin, ftmp, fout)
    os.chdir(TTK_ROOT)
def __iter__(self):
    """Iterate through the sentences in the source.

    The source may be a file-like object (anything with seek()) or a
    filename; at most self.limit lines are read either way. Progress is
    logged every 10000 sentences and the running count is kept in
    self.total.
    """
    self.total = 0

    def emit(lines):
        # Shared sentence-extraction loop for both input flavors; the
        # original duplicated this block verbatim in each branch.
        for line in itertools.islice(lines, self.limit):
            result = self.lexicon.review_to_sentences(utils.to_unicode(line))
            for sentence in result:
                if self.total % 10000 == 0:
                    logger.info('Processed %i sentences', self.total)
                self.total += 1
                yield sentence

    try:
        # Assume a file-like object; objects without seek() raise
        # AttributeError and fall through to the filename branch. The try
        # body is limited to seek() so an AttributeError raised later in
        # processing is not mistaken for "source is a filename".
        self.source.seek(0)
    except AttributeError:
        # If it didn't work like a file, use it as a string filename.
        with utils.smart_open(self.source) as fin:
            for sentence in emit(fin):
                yield sentence
    else:
        for sentence in emit(self.source):
            yield sentence
def measure_performance(test_y, result_y):
    """Log a classification report and, unless only one class is present,
    the macro and micro F1 scores."""
    report = metrics.classification_report(test_y, result_y, digits=3)
    logger.info('\n{}'.format(report))
    if len(np.unique(test_y)) == 1:
        return
    macro_f1 = metrics.f1_score(test_y, result_y, average='macro')
    logger.info('Macro F1 {0:.3f}'.format(macro_f1))
    micro_f1 = metrics.f1_score(test_y, result_y, average='micro')
    logger.info('Micro F1 {0:.3f}'.format(micro_f1))
def get_dictionary(self, saved_path, read_path):
    """Return the shared Game.DICTIONARY.

    Loads the pickled dictionary at *saved_path* when available;
    otherwise builds one from the text file at *read_path* and caches it
    to *saved_path* for next time."""
    if Game.DICTIONARY:
        return Game.DICTIONARY
    if os.path.exists(saved_path):
        logger.info("loading saved dictionary file...")
        Game.DICTIONARY = Dictionary.load_from_pickle(saved_path)
        return Game.DICTIONARY
    logger.info("constructing dictionary...")
    Game.DICTIONARY = Dictionary.construct_with_text_file(read_path)
    logger.info("saving dictionary structure...")
    Game.DICTIONARY.store(saved_path)
    return Game.DICTIONARY
def get_data(self):
    """Return (data, names, classes) arrays for this dataset.

    Uses cached .npy files under <bin_location>/all when present;
    otherwise builds the arrays by iterating self (which yields
    (item_class, name, item) triples), logs summary statistics and saves
    the arrays for next time.

    Raises:
        RuntimeError: when no files were found.
    """
    all_data_path = path.join(self.bin_location, 'all')
    if not Path(all_data_path).exists():
        makedirs(all_data_path)
    data_file = Path(all_data_path + '_data.npy')
    class_file = Path(all_data_path + '_class.npy')
    name_file = Path(all_data_path + '_name.npy')
    if data_file.exists():
        # Fast path: reuse previously saved arrays.
        logger.info('Found created file. Loading %s...', str(data_file))
        data = np.load(str(data_file))
        type_data = np.load(str(class_file))
        names_data = np.load(str(name_file))
        logger.info('Using saved data %s with %i records',
                    str(data_file), len(data))
        return data, names_data, type_data
    # np.object was merely a deprecated alias of the builtin `object`
    # (removed in NumPy 1.24); use `object` directly.
    vectors = NumpyDynamic(object)
    values = NumpyDynamic(np.int32)
    file_names = NumpyDynamic(object)
    length = []
    for item_class, name, item in self:
        vectors.add(item)
        file_names.add(name)
        values.add(item_class)
        length.append(len(item))
    data = vectors.finalize()
    names_data = file_names.finalize()
    type_data = values.finalize()
    if len(data) == 0:
        # StandardError is Python-2-only; RuntimeError subclasses it on
        # Python 2, so existing handlers still catch this.
        raise RuntimeError("No files found")
    # +0.1 guards against division by zero in the average below.
    total = float(len(length) + 0.1)
    logger.info("Loaded %s - %i with average length %6.2f, min: %i and max %i",
                self.data_path, len(data), sum(length) / total,
                min(length), max(length))
    logger.info('Saving %s', str(data_file))
    np.save(str(data_file), data)
    np.save(str(class_file), type_data)
    np.save(str(name_file), names_data)
    return data, names_data, type_data
def __init__(self, filename="", board="wwf11"):
    """Construct a game, either restored from a save file or brand new.

    Args:
        filename: the filename of a saved game, if specified
        board: the board type used when starting a new game
    """
    logger.info("Initializing game...")
    if filename:
        # Restore the board state from a previously saved game; make
        # sure the name carries the ".p" pickle extension.
        if filename[-2:] != ".p":
            filename = filename + ".p"
        logger.info("loading saved game from \"{}\"...".format(filename))
        game_data = self.__load_game_data_from_file(filename)
        self.board_type = game_data["board_type"]
        self.board = game_data["board"]
        self.filename = game_data["filename"]
    else:
        logger.info("starting new game and initializing board...")
        self.board_type = board
        self.board = Board(board)
        self.filename = None
    # Per-board resources: tile scores and the word dictionary (the
    # dictionary is cached as a pickle next to its text source).
    resource_directory = os.path.join(resource_dir, self.board_type)
    tile_path = os.path.join(resource_directory, "tile_list.txt")
    dictionary_path = os.path.join(resource_directory, "dictionary.txt")
    saved_dictionary_path = os.path.join(resource_directory, "dictionary.p")
    self.tiles = self.__load_tile_set_from_file(tile_path)
    self.dictionary = self.get_dictionary(saved_dictionary_path,
                                          dictionary_path)
    logger.info("Game initialized successfully.")
def process(self):
    """Run the preprocessing chain over every document element.

    Each element's text is tokenized, tagged and chunked into a shallow
    tree of sentences and tokens, which is exported into the
    TarsqiDocument's tags TagRepositories. Per-stage timings are logged
    at the end."""
    TagId.reset()
    source_text = self.document.source.text
    for element in self.document.elements():
        raw = source_text[element.begin:element.end]
        tokens = self.tokenize_text(raw)
        adjust_lex_offsets(tokens, element.begin)
        tagged = self.tag_text(tokens)
        # TODO: add some code to get lemmas when the TreeTagger just gets
        # <unknown>, see https://github.com/tarsqi/ttk/issues/5
        chunked = self.chunk_text(tagged)
        export(chunked, self.document)
    logger.info("tokenizer processing time: %.3f seconds" % self.tokenize_time)
    logger.info("tagger processing time: %.3f seconds" % self.tag_time)
    logger.info("chunker processing time: %.3f seconds" % self.chunk_time)
def _log_duration(alpha_name, beta_name, duration):
    """Log that the (alpha, beta) pair took *duration* seconds."""
    logger.info(u'Processed {}, {} in {} seconds'.format(
        alpha_name, beta_name, duration))