Example #1
def run_tarsqi(args):
    """Main method that is called when the script is executed from the command
    line. It creates a Tarsqi instance and lets it process the input. If the
    input is a directory, this method will iterate over the contents, setting up
    Tarsqi instances for all files in the directory. The arguments are the
    list of arguments given by the user on the command line. There is no return
    value."""
    (opts, args) = _read_arguments(args)
    if len(args) < 2:
        raise TarsqiError("missing input or output arguments\n%s" %
                          _usage_string())
    # Use os.path.abspath here because some components change the working
    # directory and when some component fails the cwd may not be reset to the
    # root directory
    inpath = os.path.abspath(args[0])
    outpath = os.path.abspath(args[1])
    t0 = time.time()
    if os.path.isdir(inpath) and os.path.isdir(outpath):
        for file in os.listdir(inpath):
            infile = inpath + os.sep + file
            outfile = outpath + os.sep + file
            if os.path.isfile(infile):
                print infile
                Tarsqi(opts, infile, outfile).process()
    elif os.path.isfile(inpath):
        if os.path.exists(outpath):
            raise TarsqiError('output file ' + outpath + ' already exists')
        Tarsqi(opts, inpath, outpath).process()
    else:
        raise TarsqiError('Invalid input and/or output options')
    logger.info("TOTAL PROCESSING TIME: %.3f seconds" % (time.time() - t0))
Example #2
def run_tarsqi(args):

    """Main method that is called when the script is executed. It creates
    a TarsqiControl instance and lets it process the input. If the
    input is a directory, this method will iterate over the contents,
    setting up TarsqiControl instances for all files in the directory.

    The arguments are the list of arguments given by the user on the
    command line. There is no return value."""

    (input_type, opts, input, output) = read_arguments(args)

    begin_time = time.time()

    if os.path.isdir(input) and os.path.isdir(output):
        for file in os.listdir(input):
            infile = input + os.sep + file
            outfile = output + os.sep + file
            if os.path.isfile(infile):
                print infile
                TarsqiControl(input_type, opts, infile, outfile).process()

    elif os.path.isfile(input):
        if os.path.exists(output):
            sys.exit('ERROR: output file ' + output + ' already exists')
        TarsqiControl(input_type, opts, input, output).process()

    else:
        sys.exit('Invalid input and/or output parameters')

    end_time = time.time()
    logger.info("TOTAL PROCESSING TIME: %.3f seconds" % (end_time - begin_time))
Example #3
def run_tarsqi(args):
    """Main method that is called when the script is executed from the command
    line. It creates a Tarsqi instance and lets it process the input. If the
    input is a directory, this method will iterate over the contents, setting up
    Tarsqi instances for all files in the directory. The arguments are the
    list of arguments given by the user on the command line. There is no return
    value."""
    (opts, args) = _read_arguments(args)
    if len(args) < 2:
        raise TarsqiError("missing input or output arguments\n%s"
                          % _usage_string())
    # Use os.path.abspath here because some components change the working
    # directory and when some component fails the cwd may not be reset to the
    # root directory
    inpath = os.path.abspath(args[0])
    outpath = os.path.abspath(args[1])
    t0 = time.time()
    if os.path.isdir(inpath) and os.path.isdir(outpath):
        for file in os.listdir(inpath):
            infile = inpath + os.sep + file
            outfile = outpath + os.sep + file
            if os.path.isfile(infile):
                print infile
                Tarsqi(opts, infile, outfile).process()
    elif os.path.isfile(inpath):
        if os.path.exists(outpath):
            raise TarsqiError('output file ' + outpath + ' already exists')
        Tarsqi(opts, inpath, outpath).process()
    else:
        raise TarsqiError('Invalid input and/or output options')
    logger.info("TOTAL PROCESSING TIME: %.3f seconds" % (time.time() - t0))
Example #4
    def get_stock_data(self, stock_name):
        """
                Return a dataframe of that stock and normalize all the values.
                (Optional: create moving average)
                """
        logger.info("Loading Stock [%s]...", stock_name)
        df = quandl.get_table('WIKI/PRICES', ticker=stock_name, paginate=True)
        df.drop(['ticker', 'open', 'high', 'low', 'close', 'ex-dividend',
                 'volume', 'split_ratio'], axis=1, inplace=True)
        df.set_index('date', inplace=True)

        # Renaming all the columns so that we can use the old version code
        df.rename(columns={
            'adj_open': 'Open',
            'adj_high': 'High',
            'adj_low': 'Low',
            'adj_volume': 'Volume',
            'adj_close': HeaderFactory.Price
        },
                  inplace=True)

        df.sort_index(ascending=True, inplace=True)
        df.dropna(inplace=True)
        return df
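A hypothetical caller for the example above, including the optional moving average the docstring mentions; the DataLoader name and the 20-day window are assumptions, while HeaderFactory.Price is the column name used in the rename above:

loader = DataLoader()                                  # hypothetical class that owns get_stock_data
df = loader.get_stock_data('AAPL')                     # normalized, date-indexed price frame
df['MA20'] = df[HeaderFactory.Price].rolling(window=20).mean()  # optional 20-day moving average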
Example #5
def run_tarsqi(args):

    """Main method that is called when the script is executed. It creates
    a TarsqiControl instance and lets it process the input. If the
    input is a directory, this method will iterate over the contents,
    setting up TarsqiControl instances for all files in the directory.

    The arguments are the list of arguments given by the user on the
    command line. There is no return value."""

    (input_type, opts, input, output) = read_arguments(args)

    begin_time = time.time()

    if os.path.isdir(input) and os.path.isdir(output):
        for file in os.listdir(input):
            infile = input + os.sep + file
            outfile = output + os.sep + file
            if os.path.isfile(infile):
                print infile
                TarsqiControl(input_type, opts, infile, outfile).process()

    elif os.path.isfile(input):
#         if os.path.exists(output):
#             sys.exit('ERROR: output file ' + output + ' already exists')
        TarsqiControl(input_type, opts, input, output).process()

    else:
        sys.exit('Invalid input and/or output parameters')

    end_time = time.time()
    logger.info("TOTAL PROCESSING TIME: %.3f seconds" % (end_time - begin_time))
Example #6
 def process_string(self, input_string):
     """Similar to process(), except that it runs on an input string and not
     on a file, it does not write the output to a file and it returns the
     TarsqiDocument."""
     logger.info(input_string)
     self.document = self.source_parser.parse_string(input_string)
     self._process_document()
     return self.document
Example #7
 def process_string(self, input_string):
     """Similar to process(), except that it runs on an input string and not
     on a file, it does not write the output to a file and it returns the
     TarsqiDocument."""
     logger.info(input_string)
     self.document = self.source_parser.parse_string(input_string)
     self._process_document()
     return self.document
Example #8
    def __iter__(self):
        logger.info("Loading %s...", self.data_path)

        for (root, dir_names, files) in walk(self.data_path):
            for name in files:
                file_name = path.join(root, name)
                data = self.source.get_vector(file_name)
                yield data, name
Example #9
 def _log_duration(duration, file_name, length):
     template = 'Preprocessed {} of length {} in {} seconds ' \
                '({} char per second)'
     message = template.format(file_name,
                               length,
                               duration,
                               length/duration)
     logger.info(message)
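A rough sketch of how a helper like this might be driven from a timing wrapper; preprocess_file here is a stand-in for whatever does the real work:

import time

start = time.time()
text = preprocess_file(file_name)        # stand-in for the actual preprocessing call
duration = (time.time() - start) or 1    # guard against a zero duration on very fast runs
_log_duration(duration, file_name, len(text))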
Example #10
    def apply_component(self, name, wrapper, infile, outfile):

        """Apply a component if the processing parameters determine that the
        component needs to be applied. This method passes the content
        tag and the xml_document to the wrapper of the component and
        asks the wrapper to process the document fragments. 

        Component-level errors are trapped here if trap_errors is True.

        Arguments:
           name - string, the name of the component
           wrapper - instance of a subclass of ComponentWrapper
           infile - string
           outfile - string

        Return value: None"""

        # NOTES
        
        # - Components still write results to file, which does not
        #   conform to the specs. But writing files to disk is but a
        #   minor part of processing time so for now we'll leave it
        #   here and let all components assume that there is an input
        #   file to work with.

        # - Having said that, it is not quite true that the wrappers
        #   use the input file. The wrappers use the xml document and
        #   the content tag and then (i) create fragments from the xml
        #   doc, (ii) process the fragments, (iii) reinsert the
        #   fragments in the xml doc, and (iv) write the xml doc to a
        #   file. But the file created is not opened by the next
        #   wrapper.

        # - Errors are now trapped here instead of in the component
        #   since we do not tell the component what the output file
        #   is.

        def call_wrapper(wrapper, content_tag, xmldoc, trap_errors, outfile):
            wrapper(content_tag, xmldoc, self).process()
            self.xml_document.save_to_file(outfile)

        logger.info("RUNNING " + name + " on: " + infile)
        #logger.out('Running', name)
        trap_errors = self.getopt_trap_errors()
        if trap_errors:
            try:
                call_wrapper(wrapper, self.content_tag, self.xml_document,
                             trap_errors, outfile)
            except:
                logger.error(name + " error on " + infile + "\n\t"
                             + str(sys.exc_type) + "\n\t"
                             + str(sys.exc_value) + "\n")
                shutil.copy(infile, outfile)
        else:
            call_wrapper(wrapper, self.content_tag, self.xml_document,
                         trap_errors, outfile)
Example #11
 def count_occurences(y):
     if len(y.shape) > 1 and y.shape[1] > 1:
         for current in range(0, y.shape[1]):
             total = sum(y[:, current])
             logger.info("Found %s type with %i records", current, total)
     else:
         types = np.unique(y)
         for current in types:
             total = sum(y == current)
             logger.info("Found %s type with %i records", current, total)
Example #12
    def apply_component(self, name, wrapper, infile, outfile):
        """Apply a component if the processing parameters determine that the
        component needs to be applied. This method passes the content
        tag and the xml_document to the wrapper of the component and
        asks the wrapper to process the document fragments. 

        Component-level errors are trapped here if trap_errors is True.

        Arguments:
           name - string, the name of the component
           wrapper - instance of a subclass of ComponentWrapper
           infile - string
           outfile - string

        Return value: None"""

        # NOTES

        # - Components still write results to file, which does not
        #   conform to the specs. But writing files to disk is but a
        #   minor part of processing time so for now we'll leave it
        #   here and let all components assume that there is an input
        #   file to work with.

        # - Having said that, it is not quite true that the wrappers
        #   use the input file. The wrappers use the xml document and
        #   the content tag and then (i) create fragments from the xml
        #   doc, (ii) process the fragments, (iii) reinsert the
        #   fragments in the xml doc, and (iv) write the xml doc to a
        #   file. But the file created is not opened by the next
        #   wrapper.

        # - Errors are now trapped here instead of in the component
        #   since we do not tell the component what the output file
        #   is.

        def call_wrapper(wrapper, content_tag, xmldoc, trap_errors, outfile):
            wrapper(content_tag, xmldoc, self).process()
            self.xml_document.save_to_file(outfile)

        logger.info("RUNNING " + name + " on: " + infile)
        #logger.out('Running', name)
        trap_errors = self.getopt_trap_errors()
        if trap_errors:
            try:
                call_wrapper(wrapper, self.content_tag, self.xml_document,
                             trap_errors, outfile)
            except:
                logger.error(name + " error on " + infile + "\n\t" +
                             str(sys.exc_type) + "\n\t" + str(sys.exc_value) +
                             "\n")
                shutil.copy(infile, outfile)
        else:
            call_wrapper(wrapper, self.content_tag, self.xml_document,
                         trap_errors, outfile)
Example #13
 def process_string(self, input_string):
     """Similar to process(), except that it runs on an input string and not
     on a file, it does not write the output to a file and it returns the
     TarsqiDocument."""
     logger.info(input_string[:75].replace('\n', ' '))
     self.source_parser.parse_string(input_string, self.tarsqidoc)
     self.metadata_parser.parse(self.tarsqidoc)
     self.docstructure_parser.parse(self.tarsqidoc)
     for (name, wrapper) in self.pipeline:
         self._apply_component(name, wrapper, self.tarsqidoc)
     return self.tarsqidoc
Example #14
File: tarsqi.py Project: tarsqi/ttk
 def process_string(self, input_string):
     """Similar to process(), except that it runs on an input string and not
     on a file, it does not write the output to a file and it returns the
     TarsqiDocument."""
     logger.info(input_string[:75].replace('\n', ' '))
     self.source_parser.parse_string(input_string, self.tarsqidoc)
     self.metadata_parser.parse(self.tarsqidoc)
     self.docstructure_parser.parse(self.tarsqidoc)
     for (name, wrapper) in self.pipeline:
         self._apply_component(name, wrapper, self.tarsqidoc)
     return self.tarsqidoc
Example #15
 def process(self):
     """Parse the source with the source parser, the metadata parser and the
     document structure parser, apply all components and write the results to
     a file. The actual processing itself is driven using the processing
     options set at initialization. Components are given the TarsqiDocument
     and update it."""
     if not self._skip_file():
         self._cleanup_directories()
         logger.info(self.input)
         self.document = self.source_parser.parse_file(self.input)
         self._process_document()
         self._write_output()
Example #16
 def process(self):
     """Parse the source with the source parser, the metadata parser and the
     document structure parser, apply all components and write the results to
     a file. The actual processing itself is driven using the processing
     options set at initialization. Components are given the TarsqiDocument
     and update it."""
     if not self._skip_file():
         self._cleanup_directories()
         logger.info(self.input)
         self.document = self.source_parser.parse_file(self.input)
         self._process_document()
         self._write_output()
Example #17
 def get_stock_data(self, stock_name):
     file_path = path.join(Constants.DATASETS_MARKET, 'reddit/DJIA_table.csv')
     logger.info("Loading [%s]...", file_path)
     market_data = pd.read_csv(file_path, na_values=['nan'])
     # drop unadjusted close
     market_data.Date = pd.to_datetime(market_data.Date, format='%Y-%m-%d')
     market_data.set_index('Date', inplace=True)
     market_data.reindex()
     market_data.sort_index(ascending=True, inplace=True)
     market_data.drop(labels=[HeaderFactory.Price], axis=1, inplace=True)
     market_data.rename(columns={"Adj Close": HeaderFactory.Price}, inplace=True)
     market_data.dropna(inplace=True)
     return market_data
Example #18
def feed_subscription_url_from_xml(fname):
    if not os.path.exists(fname):
        logger.warn('%s does not exist' % fname)
        return

    counter = 0
    with open_workbook(fname) as wb:
        s = wb.sheet_by_index(0)
        for row in range(1, s.nrows):
            Subscription.get_or_create(
                index_url=s.cell(row, 1).value
            )
            counter += 1
        logger.info('%s index_url(s) has been inserted' % counter)
    return
Example #19
 def process_document(self):
     """Parse the source with the source parser, the metadata parser and the
     document structure parser, apply all components and write the results to
     a file. The actual processing itself is driven using the processing
     options set at initialization. Components are given the TarsqiDocument
     and update it."""
     self._cleanup_directories()
     logger.info(self.input)
     logger.info("Source type is '%s'" % self.options.source)
     self.source_parser.parse_file(self.input, self.tarsqidoc)
     self.metadata_parser.parse(self.tarsqidoc)
     self.docstructure_parser.parse(self.tarsqidoc)
     for (name, wrapper) in self.pipeline:
         self._apply_component(name, wrapper, self.tarsqidoc)
     self._write_output()
Example #20
File: tarsqi.py Project: tarsqi/ttk
 def process_document(self):
     """Parse the source with the source parser, the metadata parser and the
     document structure parser, apply all components and write the results to
     a file. The actual processing itself is driven using the processing
     options set at initialization. Components are given the TarsqiDocument
     and update it."""
     self._cleanup_directories()
     logger.info(self.input)
     logger.info("Source type is '%s'" % self.options.source)
     self.source_parser.parse_file(self.input, self.tarsqidoc)
     self.metadata_parser.parse(self.tarsqidoc)
     self.docstructure_parser.parse(self.tarsqidoc)
     for (name, wrapper) in self.pipeline:
         self._apply_component(name, wrapper, self.tarsqidoc)
     self._write_output()
Example #21
 def get_stock_data(self, stock_name: str):
     file_path = path.join(Constants.DATASETS_MARKET, 'stock/{}.csv'.format(stock_name))
     logger.info("Loading [%s]...", file_path)
     market_data = pd.read_csv(file_path, na_values=['nan'])
     # drop unadjusted close
     market_data.Date = pd.to_datetime(market_data.Date, format='%Y-%m-%d')
     market_data.set_index('Date', inplace=True)
     market_data.reindex()
     market_data.sort_index(ascending=True, inplace=True)
     if 'curncy' in stock_name.lower():
         market_data.drop(labels=["PX_VOLUME"], axis=1, inplace=True)
     market_data.rename(columns={'PX_OPEN': 'Open', 'PX_HIGH': 'High', 'PX_LOW': 'Low', 'PX_VOLUME': 'Volume',
                        'PX_LAST': HeaderFactory.Price}, inplace=True)
     # market_data.dropna(inplace=True)
     return market_data
Example #22
    def measure_performance_auc(test_y, result_y, result_y_prob):
        try:
            vacc = metrics.accuracy_score(test_y, result_y)
            # find validation AUC
            if len(np.unique(test_y)) == 2:
                vauc = roc_auc_score(test_y, result_y_prob)
                logger.info('Accuracy: {0:.3f} and AUC {1:.3f}'.format(
                    vacc, vauc))
            else:
                vauc = None
                logger.info('Accuracy: {0:.3f}'.format(vacc))

            return vacc, vauc
        except:
            logger.error("Error calculating metrics")
Example #23
    def process_training(self):
        """This is the method that is called from the TarsqiControl
        class. Fragments are created and processed. It's a training version
        of process(self). Instead of using process_fragments inside the 
        individual component, it uses process_training_fragments.
        """

        self.create_fragments(self.tag, 'fragment')
        begin_time = time.time()
        self.process_training_fragments()

        end_time = time.time()
        total_time = end_time - begin_time
        logger.info("%s DONE, TRAINING processing time was %.3f seconds" %
                    (self.component_name, total_time))
        self.retrieve_fragments('fragment')
Example #24
 def __iter__(self):
     with io.open(self.file_name, 'rt', encoding='utf8') as csv_file:
         logger.info('Loading: %s', self.file_name)
         for line in csv_file:
             row = re.split(r'\t+', line)
             review_id = row[0]
             total_rows = len(row)
             if total_rows >= 3:
                 type_class = self.convertor.is_supported(row[total_rows - 2])
                 if type_class is not None:
                     text = row[total_rows - 1]
                     vector = self.source.get_vector_from_review(text)
                     if vector is not None:
                         yield type_class, review_id, vector
                     else:
                         logger.warn("Vector not found: %s", text)
Example #25
    def process(self):
        """This is the method that is called from the TarsqiControl
        class. Fragments are created, processed and retrieved. The
        method that processes fragments (process_fragment) should be
        defined for each wrapper individually.

        No arguments and no return value."""

        self.create_fragments(self.tag, 'fragment')
        begin_time = time.time()
        self.process_fragments()
        end_time = time.time()
        total_time = end_time - begin_time
        logger.info("%s DONE, processing time was %.3f seconds" %
                    (self.component_name, total_time))
        self.retrieve_fragments('fragment')
Example #26
 def run(self):
     """Main method that is called when the script is executed from the command
     line. It creates a Tarsqi instance and lets it process the input. If the
     input is a directory, this method will iterate over the contents, setting up
     Tarsqi instances for all files in the directory. The arguments are the list
     of arguments given by the user on the command line."""
     t0 = time.time()
     if self.inpath is None and self.outpath is None:
         self._run_tarsqi_on_pipe()
     elif os.path.isdir(self.inpath):
         self._run_tarsqi_on_directory()
     elif os.path.isfile(self.inpath):
         self._run_tarsqi_on_file()
     else:
         raise TarsqiError('Invalid input')
     logger.info("TOTAL PROCESSING TIME: %.3f seconds" % (time.time() - t0))
     logger.report()
Example #27
    def process(self):

        """This is the method that is called from the TarsqiControl
        class. Fragments are created, processed and retrieved. The
        method that processes fragments (process_fragment) should be
        defined for each wrapper individually.

        No arguments and no return value."""

        self.create_fragments(self.tag, 'fragment')
        begin_time = time.time()
        self.process_fragments()
        end_time = time.time()
        total_time = end_time - begin_time
        logger.info("%s DONE, processing time was %.3f seconds" %
                    (self.component_name, total_time))
        self.retrieve_fragments('fragment')
Example #28
def process(args):
    """
    Run processing step
    """
    start = time.time()
    alpha_files = path.iter_files_in(args.preprocessed_dir)
    beta_files = path.iter_files_in(args.preprocessed_dir)
    if args.parallel:
        cnt = process_parallel(args, alpha_files, beta_files)
    else:
        cnt = process_serial(args, alpha_files, beta_files)

    duration = time.time() - start
    if duration == 0:
        duration = 1
    comparisons_per_sec = cnt / duration
    logger.info('Processed {} files per second'.format(comparisons_per_sec))
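The attribute accesses above (args.preprocessed_dir, args.parallel) suggest an argparse namespace; a hypothetical reconstruction of that interface, not the project's actual CLI, might look like:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('preprocessed_dir')                   # directory of preprocessed files
parser.add_argument('--parallel', action='store_true')    # compare files in parallel
process(parser.parse_args())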
Example #29
File: tarsqi.py Project: tarsqi/ttk
 def _apply_component(self, name, wrapper, tarsqidocument):
     """Apply a component by taking the TarsqDocument, which includes the
     options from the Tarsqi instance, and passing it to the component
     wrapper. Component-level errors are trapped here if --trap-errors is
     True. If errors are trapped, it is still possible that partial results
     were written to the TagRepositories in the TarsqiDocument."""
     logger.info(name + '............')
     t1 = time.time()
     if self.options.trap_errors:
         try:
             wrapper(tarsqidocument).process()
         except:
             logger.error("%s error:\n\t%s\n\t%s\n"
                          % (name, sys.exc_type, sys.exc_value))
     else:
         wrapper(tarsqidocument).process()
     logger.info("%s DONE (%.3f seconds)" % (name, time.time() - t1))
Example #30
File: DIRT.py Project: gnarph/DIRT
def process(args):
    """
    Run processing step
    """
    start = time.time()
    alpha_files = path.iter_files_in(args.preprocessed_dir)
    beta_files = path.iter_files_in(args.preprocessed_dir)
    if args.parallel:
        cnt = process_parallel(args, alpha_files, beta_files)
    else:
        cnt = process_serial(args, alpha_files, beta_files)

    duration = time.time() - start
    if duration == 0:
        duration = 1
    comparisons_per_sec = cnt/duration
    logger.info('Processed {} files per second'.format(comparisons_per_sec))
Example #31
 def _apply_component(self, name, wrapper, tarsqidocument):
     """Apply a component by taking the TarsqDocument, which includes the
     options from the Tarsqi instance, and passing it to the component
     wrapper. Component-level errors are trapped here if --trap-errors is
     True. If errors are trapped, it is still possible that partial results
     were written to the TagRepositories in the TarsqiDocument."""
     logger.info(name + '............')
     t1 = time.time()
     if self.options.trap_errors:
         try:
             wrapper(tarsqidocument).process()
         except:
             logger.error("%s error:\n\t%s\n\t%s\n" %
                          (name, sys.exc_type, sys.exc_value))
     else:
         wrapper(tarsqidocument).process()
     logger.info("%s DONE (%.3f seconds)" % (name, time.time() - t1))
Example #32
File: tarsqi.py Project: tarsqi/ttk
 def run(self):
     """Main method that is called when the script is executed from the command
     line. It creates a Tarsqi instance and lets it process the input. If the
     input is a directory, this method will iterate over the contents, setting up
     Tarsqi instances for all files in the directory. The arguments are the list
     of arguments given by the user on the command line."""
     t0 = time.time()
     if self.inpath is None and self.outpath is None:
         self._run_tarsqi_on_pipe()
     elif os.path.isdir(self.inpath):
         self._run_tarsqi_on_directory()
     elif os.path.isfile(self.inpath):
         self._run_tarsqi_on_file()
     else:
         raise TarsqiError('Invalid input')
     logger.info("TOTAL PROCESSING TIME: %.3f seconds" % (time.time() - t0))
     logger.report(sys.stderr)
Example #33
    def save(self, filename=None):
        """saves an unfinished game to disk"""

        if not os.path.exists(full_saved_games_dir):
            os.makedirs(full_saved_games_dir)
        self.filename = (filename if filename
                         else self.filename if self.filename
                         else generate_file_name())
        logger.info("Saving game to file \"{}\"...".format(self.filename))
        game_data = {
            "board_type": self.board_type,
            "board": self.board,
            "name": self.filename
        }
        with gzip.open(
                os.path.join(full_saved_games_dir, "{}.p".format(self.filename)),
                "wb") as f:
            f.write(pickle.dumps(game_data))
        logger.info("Game saved.")
Example #34
    def process_fragments(self):
        """Set fragment names, create the vectors for each fragment, run the
        classifier and add links from the classifier to the fragments."""

        os.chdir(self.DIR_CLASSIFIER)
        perl = self.tarsqi_instance.getopt_perl()

        for fragment in self.fragments:
            print fragment

            # set fragment names
            base = fragment[0]
            fin = os.path.join(self.DIR_DATA,
                               base + '.' + self.CREATION_EXTENSION)
            ftmp = os.path.join(self.DIR_DATA, base + '.' + self.TMP_EXTENSION)
            fout = os.path.join(self.DIR_DATA,
                                base + '.' + self.RETRIEVAL_EXTENSION)

            # process them
            #self._create_vectors(in, in+'.EE2', in+'.ET2', fragment)

            fin_ee = fin + '.EE'
            fin_et = fin + '.ET'
            ee_model = 'data/op.e-e.model'
            et_model = 'data/op.e-t.model'
            commands = [
                "%s prepareClassifier.pl %s %s %s" %
                (perl, fin, fin_ee, fin_et),
                "./mxtest.opt -input %s -model %s -output %s.REL >> class.log"
                % (fin_ee, ee_model, fin_ee),
                "./mxtest.opt -input %s -model %s -output %s.REL >> class.log"
                % (fin_et, et_model, fin_et),
                "%s collectClassifier.pl %s %s %s" %
                (perl, fin_ee, fin_et, ftmp)
            ]

            for command in commands:
                logger.info(command)
                os.system(command)

            self._add_tlinks_to_fragment(fin, ftmp, fout)

        os.chdir(TTK_ROOT)
Example #35
def create_tables(tables, reset=False):
    for table in tables:
        if not table.table_exists():
            table.create_table()
            logger.info('Table %s created' % table.__name__)
        elif reset:
            table.drop_table(cascade=True)
            logger.info('Existing table %s dropped' % table.__name__)
            table.create_table()
            logger.info('Table %s created' % table.__name__)
        else:
            logger.info('Table %s already exists' % table.__name__)
    return
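The table objects here behave like peewee models (table_exists, create_table, and drop_table(cascade=True) are all peewee Model methods); a hypothetical call using the models that appear elsewhere in these examples:

create_tables([Subscription, Article, SubscriptionArticle])              # create whatever is missing
create_tables([Subscription, Article, SubscriptionArticle], reset=True)  # drop and recreate everything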
Example #36
    def process_item(self, item, spider):
        db_write = getattr(spider, 'db_write', None)

        if db_write:
            try:
                article, url_inserted = Article.get_or_create(
                    article_url=item['article_url'])
                if url_inserted:
                    subs_article, relation_created = SubscriptionArticle.get_or_create(
                        subscription=item['subscription_id'],
                        article=article.id)
                    logger.info(
                        'article_url [ID:%s] is now associated with index_url [ID:%s]',
                        subs_article.article, subs_article.subscription)
                else:
                    subs_article, relation_created = SubscriptionArticle.get_or_create(
                        subscription=item['subscription_id'],
                        article=article.id)
                    if not relation_created:
                        logger.info(
                            'relation between article_url [ID:%s] and index_url [ID:%s] has been ignored',
                            subs_article.article, subs_article.subscription)
                    else:
                        logger.info(
                            'article_url [ID:%s] has created a new relationship with index_url [ID:%s]',
                            subs_article.article, subs_article.subscription)
            except (RuntimeError, KeyError, NameError) as e:
                logger.error('%s happened when handling %s', str(e),
                             item['article_url'])
                raise RuntimeError('Error received from Scrapy Pipelines')
Example #37
 def __iter__(self):
     self.total = 0
     for root, subdirs, files in os.walk(self.source):
         for fname in files:
             full_path = os.path.join(root, fname)
             if '.bin' in full_path or '.npy' in full_path:
                 logger.debug('Ignore %s', fname)
             else:
                 self.totalFiles += 1
                 if self.totalFiles % 10000 == 0:
                     logger.info('Processed %i files and %i sentences', self.totalFiles, self.total)
                 with open(full_path, encoding='utf8') as file:
                     try:
                         text = file.read()
                         text = text.replace('\n', '')
                         result = self.lexicon.review_to_sentences(utils.to_unicode(text))
                         for sentence in result:
                             self.total += 1
                             yield sentence
                     except:
                         logger.error("failed processing file: %s", fname)
     logger.info('Processed %i', self.total)
Example #38
    def process_fragments(self):

        """Set fragment names, create the vectors for each fragment, run the
        classifier and add links from the classifier to the fragments."""

        os.chdir(self.DIR_CLASSIFIER)
        perl = self.tarsqi_instance.getopt_perl()

        for fragment in self.fragments:

            # set fragment names
            base = fragment[0]
            fin = os.path.join(self.DIR_DATA, base+'.'+self.CREATION_EXTENSION)
            ftmp = os.path.join(self.DIR_DATA, base+'.'+self.TMP_EXTENSION)
            fout = os.path.join(self.DIR_DATA, base+'.'+self.RETRIEVAL_EXTENSION)

            # process them
            #self._create_vectors(in, in+'.EE2', in+'.ET2', fragment)

            fin_ee = fin + '.EE'
            fin_et = fin + '.ET'
            ee_model = 'data/op.e-e.model'
            et_model = 'data/op.e-t.model'
            commands = [
                "%s prepareClassifier.pl %s %s %s" % (perl, fin, fin_ee, fin_et),
                "./mxtest.opt -input %s -model %s -output %s.REL >> class.log" % (fin_ee, ee_model, fin_ee),
                "./mxtest.opt -input %s -model %s -output %s.REL >> class.log" % (fin_et, et_model, fin_et),
                "%s collectClassifier.pl %s %s %s" % (perl, fin_ee, fin_et, ftmp) ]

            for command in commands:
                logger.info(command)
                os.system(command)
                
            self._add_tlinks_to_fragment(fin, ftmp, fout)

        os.chdir(TTK_ROOT)
Example #39
 def __iter__(self):
     """Iterate through the lines in the source."""
     self.total = 0
     try:
         # Assume it is a file-like object and try treating it as such
         # Things that don't have seek will trigger an exception
         self.source.seek(0)
         for line in itertools.islice(self.source, self.limit):
             result = self.lexicon.review_to_sentences(utils.to_unicode(line))
             for sentence in result:
                 if self.total % 10000 == 0:
                     logger.info('Processed %i sentences', self.total)
                 self.total += 1
                 yield sentence
     except AttributeError:
         # If it didn't work like a file, use it as a string filename
         with utils.smart_open(self.source) as fin:
             for line in itertools.islice(fin, self.limit):
                 result = self.lexicon.review_to_sentences(utils.to_unicode(line))
                 for sentence in result:
                     if self.total % 10000 == 0:
                         logger.info('Processed %i sentences', self.total)
                     self.total += 1
                     yield sentence
Example #40
    def measure_performance(test_y, result_y):
        report = metrics.classification_report(test_y, result_y, digits=3)
        logger.info('\n{}'.format(report))

        if len(np.unique(test_y)) != 1:
            macro = metrics.f1_score(test_y, result_y, average='macro')
            logger.info('Macro F1 {0:.3f}'.format(macro))

            macro = metrics.f1_score(test_y, result_y, average='micro')
            logger.info('Micro F1 {0:.3f}'.format(macro))
Example #41
 def get_dictionary(self, saved_path, read_path):
     if not Game.DICTIONARY:
         # load a saved dictionary object or construct a new one
         if os.path.exists(saved_path):
             logger.info("loading saved dictionary file...")
             Game.DICTIONARY = Dictionary.load_from_pickle(saved_path)
         else:
             logger.info("constructing dictionary...")
             Game.DICTIONARY = Dictionary.construct_with_text_file(
                 read_path)
             logger.info("saving dictionary structure...")
             Game.DICTIONARY.store(saved_path)
     return Game.DICTIONARY
Example #42
    def get_data(self):
        all_data_path = path.join(self.bin_location, 'all')
        if not Path(all_data_path).exists():
            makedirs(all_data_path)

        data_file = Path(all_data_path + '_data.npy')
        class_file = Path(all_data_path + '_class.npy')
        name_file = Path(all_data_path + '_name.npy')
        if data_file.exists():
            logger.info('Found created file. Loading %s...', str(data_file))
            data = np.load(str(data_file))
            type_data = np.load(str(class_file))
            names_data = np.load(str(name_file))
            logger.info('Using saved data %s with %i records', str(data_file), len(data))
            return data, names_data, type_data

        vectors = NumpyDynamic(np.object)
        values = NumpyDynamic(np.int32)
        file_names = NumpyDynamic(np.object)
        length = []
        for item_class, name, item in self:
            vectors.add(item)
            file_names.add(name)
            values.add(item_class)
            length.append(len(item))

        data = vectors.finalize()
        names_data = file_names.finalize()
        type_data = values.finalize()

        if len(data) == 0:
            raise StandardError("No files found")
        total = float(len(length) + 0.1)
        logger.info("Loaded %s - %i with average length %6.2f, min: %i and max %i", self.data_path, len(data),
                    sum(length) / total, min(length), max(length))
        logger.info('Saving %s', str(data_file))
        np.save(str(data_file), data)
        np.save(str(class_file), type_data)
        np.save(str(name_file), names_data)
        return data, names_data, type_data
Example #43
    def __init__(self, filename="", board="wwf11"):
        """constructor for a game

        Args:
            filename: the filename of a saved game if specified

        """

        logger.info("Initializing game...")

        # load the state of the board from a saved game
        if filename:
            filename = filename + ".p" if filename[-2:] != ".p" else filename
            logger.info("loading saved game from \"{}\"...".format(filename))
            game_data = self.__load_game_data_from_file(filename)
            self.board_type = game_data["board_type"]
            self.board = game_data["board"]
            self.filename = game_data["filename"]
        else:
            logger.info("starting new game and initializing board...")
            self.board_type = board
            self.board = Board(board)
            self.filename = None

        resource_directory = os.path.join(resource_dir, self.board_type)
        tile_path = os.path.join(resource_directory, "tile_list.txt")
        dictionary_path = os.path.join(resource_directory, "dictionary.txt")
        saved_dictionary_path = os.path.join(resource_directory,
                                             "dictionary.p")

        # load the list of tiles and their corresponding scores
        self.tiles = self.__load_tile_set_from_file(tile_path)

        self.dictionary = self.get_dictionary(saved_dictionary_path,
                                              dictionary_path)

        logger.info("Game initialized successfully.")
Example #44
 def process(self):
     """Retrieve the element tags from the TarsqiDocument and hand the text for the
     elements as strings to the preprocessing chain. The result is a shallow
     tree with sentences and tokens. These are inserted into the
     TarsqiDocument's tags TagRepositories."""
     TagId.reset()
     for element in self.document.elements():
         text = self.document.source.text[element.begin:element.end]
         tokens = self.tokenize_text(text)
         adjust_lex_offsets(tokens, element.begin)
         text = self.tag_text(tokens)
         # TODO: add some code to get lemmas when the TreeTagger just gets
         # <unknown>, see https://github.com/tarsqi/ttk/issues/5
         text = self.chunk_text(text)
         export(text, self.document)
     logger.info("tokenizer processing time: %.3f seconds" % self.tokenize_time)
     logger.info("tagger processing time: %.3f seconds" % self.tag_time)
     logger.info("chunker processing time: %.3f seconds" % self.chunk_time)
Example #45
 def _log_duration(alpha_name, beta_name, duration):
     template = u'Processed {}, {} in {} seconds'
     message = template.format(alpha_name,
                               beta_name,
                               duration)
     logger.info(message)