Esempio n. 1
0
 def _find_lexically_based_slink(self, sentence, event_expr):
     """Try to find lexically based Slinks for an instance of EventExpression
     using forward, backward and reporting FSA patterns. No return value; if
     an Slink is found, it will be created by the chunk that embeds the Slink
     triggering event."""
     evNode = sentence[event_expr.locInSent]
     if evNode is None:
         # Without a node for the event there is nothing to anchor the
         # pattern search on.
         logger.error("No node found at locInSent=%s" %
                      event_expr.locInSent)
         return
     slink_created = False
     logger.debug("Sentence element class: %s" % evNode.__class__.__name__)
     forwardFSAs = event_expr.slinkingContexts('forward')
     if forwardFSAs:
         logger.debug("Applying FORWARD slink FSAs")
         slink_created = evNode.find_forward_slink(forwardFSAs)
         logger.debug("forward slink created = %s" % slink_created)
     if not slink_created:
         backwardFSAs = event_expr.slinkingContexts('backwards')
         if backwardFSAs:
             logger.debug("Applying BACKWARD slink FSAs")
             slink_created = evNode.find_backward_slink(backwardFSAs)
             logger.debug("backward slink created = %s" % slink_created)
     if not slink_created:
         reportingFSAs = event_expr.slinkingContexts('reporting')
         if reportingFSAs:
             logger.debug("Applying REPORTING slink FSAs")
             slink_created = evNode.find_reporting_slink(reportingFSAs)
             # Moved inside the guard for consistency with the forward and
             # backward cases; previously it logged even when no reporting
             # FSAs were applied.
             logger.debug("reporting slink created = %s" % slink_created)
Esempio n. 2
0
    def process_fragments(self):
        """Set fragment names, create the vectors for each fragment, run the
        classifier and add links from the classifier to the fragments."""
        os.chdir(self.DIR_LINK_MERGER + os.sep + 'sputlink')
        # The perl executable is configurable; the previous hard-coded
        # candidates ('/usr/local/ActivePerl-5.8/bin/perl', then 'perl') were
        # dead assignments that were immediately overwritten and are removed.
        perl = self.tarsqi_instance.getopt_perl()
        for fragment in self.fragments:
            # set fragment names
            base = fragment[0]
            in_fragment = os.path.join(self.DIR_DATA, base + '.' + self.CREATION_EXTENSION)
            tmp_fragment = os.path.join(self.DIR_DATA, base + '.' + self.TMP_EXTENSION)
            out_fragment = os.path.join(self.DIR_DATA, base + '.' + self.RETRIEVAL_EXTENSION)
            # process them
            command = "%s merge.pl %s %s" % (perl, in_fragment, tmp_fragment)
            (i, o, e) = os.popen3(command)
            # stderr lines that start with 'warn' are warnings, the rest are
            # treated as errors
            for line in e:
                if line.lower().startswith('warn'):
                    logger.warn('MERGING: ' + line)
                else:
                    logger.error('MERGING: ' + line)
            for line in o:
                logger.debug('MERGING: ' + line)
            self._add_tlinks_to_fragment(in_fragment, tmp_fragment, out_fragment)
        os.chdir(TTK_ROOT)
Esempio n. 3
0
 def addInPreviousSublist(self, list, element):
     """Append element to the previous sublist (index self.counter - 1),
     creating the first sublist when both the list and the counter are empty.
     Logs an error when the list is too short to have a previous sublist.
     ('list' shadows the builtin but is kept for interface compatibility.)"""
     if len(list) == 0 and self.counter == 0:
         list.append([element])
     elif len(list) >= self.counter:
         # Was 'len(list) >= self.counter - 1', which admitted an index one
         # past the end and raised IndexError when len(list) == counter - 1.
         list[self.counter - 1].append(element)
     else:
         logger.error("ERROR: list should be longer")
Esempio n. 4
0
 def run(self):
     """Periodically push data to the server until self.stop_run is set.
     RuntimeErrors from a single push are logged and do not end the loop."""
     while True:
         if self.stop_run:
             break
         try:
             self.push_to_server()
         except RuntimeError as err:
             logger.error(str(err))
         time.sleep(self.time_interval)
Esempio n. 5
0
    def process_item(self, item, spider):
        """Persist the scraped article URL and its subscription relation.

        Writes only when the spider carries a truthy 'db_write' attribute.
        Database errors are logged and re-raised as RuntimeError (with the
        original exception chained). Returns None, as before.
        """
        db_write = getattr(spider, 'db_write', None)
        if not db_write:
            return
        try:
            article, url_inserted = Article.get_or_create(
                article_url=item['article_url'])
            # Both branches of the original code issued this exact same
            # get_or_create call, so it is now done once up front.
            subs_article, relation_created = SubscriptionArticle.get_or_create(
                subscription=item['subscription_id'],
                article=article.id)
            if url_inserted:
                logger.info(
                    'article_url [ID:%s] is now associated with index_url [ID:%s]',
                    subs_article.article, subs_article.subscription)
            elif not relation_created:
                logger.info(
                    'relation between article_url [ID:%s] and index_url [ID:%s] has been ignored',
                    subs_article.article, subs_article.subscription)
            else:
                logger.info(
                    'article_url [ID:%s] has created a new relationship with index_url [ID:%s]',
                    subs_article.article, subs_article.subscription)
        except (RuntimeError, KeyError, NameError) as e:
            logger.error('%s happened when handling %s', str(e),
                         item['article_url'])
            # Chain the original exception so the root cause is not lost.
            raise RuntimeError('Error received from Scrapy Pipelines') from e
Esempio n. 6
0
def _feature_extract(problem_name,
                     filelist,
                     path,
                     problematic_filename,
                     silent,
                     with_existed_merged_file=False):
    """Quote-clean each file in filelist and run feature extraction on it.

    When silent is True, extraction failures are collected rather than
    raised; the offending (cleaned) file names are then logged and written
    to problematic_filename (resolved under LOG_DATA_PATH) when that
    argument is not None.
    """
    problematic_files = []
    for file in filelist:
        cleaned = file + QUOTE_CLEAN_SUFFIX
        cleaner.quote_cleaner(file, cleaned)
        if silent:
            try:
                _feature_extraction(problem_name, cleaned, path,
                                    with_existed_merged_file)
            except Exception:
                problematic_files.append(cleaned)
        else:
            _feature_extraction(problem_name, cleaned, path,
                                with_existed_merged_file)

    if problematic_filename is not None:  # was 'problematic_filename != None'
        problematic_filename = os.path.join(LOG_DATA_PATH,
                                            problematic_filename)
        if problematic_filename.endswith(QUOTE_CLEAN_SUFFIX):
            problematic_filename = problematic_filename[:-len(
                QUOTE_CLEAN_SUFFIX)]
        # 'with' guarantees the handle is closed; the original leaked it on
        # any exception between open() and close().
        with open(problematic_filename, 'w') as f:
            for file in problematic_files:
                logger.error(
                    '========================PROBLEM===========================')
                logger.error(file)
                f.write(file)
                f.write('\n')
Esempio n. 7
0
 def addInPreviousSublist(self, list, element):
     """Append element to the previous sublist (index self.counter - 1),
     creating the first sublist when both the list and the counter are empty.
     Logs an error when the list is too short to have a previous sublist.
     ('list' shadows the builtin but is kept for interface compatibility.)"""
     if len(list) == 0 and self.counter == 0:
         list.append([element])
     elif len(list) >= self.counter:
         # Was 'len(list) >= self.counter - 1', which admitted an index one
         # past the end and raised IndexError when len(list) == counter - 1.
         list[self.counter - 1].append(element)
     else:
         logger.error("ERROR: list should be longer")
Esempio n. 8
0
 def _get_default(self, doctype, feature, fallback):
     """Look up the default value of 'feature' for the given document type;
     return 'fallback' (and log an error) when no value is available."""
     try:
         defaults_for_type = DSI_DEFAULTS[doctype]
         return defaults_for_type[feature]
     except KeyError:
         logger.error("No default %s for document type %s" % (feature, doctype))
         return fallback
Esempio n. 9
0
 def _get_default(self, doctype, feature, fallback):
     """Look up the default value of 'feature' for the given document type;
     return 'fallback' (and log an error) when no value is available."""
     try:
         defaults_for_type = DSI_DEFAULTS[doctype]
         return defaults_for_type[feature]
     except KeyError:
         logger.error("No default %s for document type %s" % (feature, doctype))
         return fallback
Esempio n. 10
0
 def __init__(self, pipeline=None, dom_node=None):
     """Initialize from a pipeline or a DOM node; the pipeline takes
     precedence when both are given. Logs an error when neither is."""
     if pipeline is None and dom_node is None:
         logger.error("ProcessingStep cannot be initialized")
     elif pipeline is None:
         self._initialize_from_dom_node(dom_node)
     else:
         self._initialize_from_pipeline(pipeline)
Esempio n. 11
0
 def _write_output(self):
     """Write the TarsqiDocument to the output file."""
     if self.options.trap_errors:
         try:
             self.tarsqidoc.print_all(self.output)
         except:
             logger.error("Writing output failed")
     else:
         self.tarsqidoc.print_all(self.output)
Esempio n. 12
0
File: tarsqi.py Progetto: tarsqi/ttk
 def _write_output(self):
     """Write the TarsqiDocument to the output file."""
     if self.options.trap_errors:
         try:
             self.tarsqidoc.print_all(self.output)
         except:
             logger.error("Writing output failed")
     else:
         self.tarsqidoc.print_all(self.output)
Esempio n. 13
0
 def createTLinksFromSLinks(self):
     """Calls lookForStlinks for a given Slink object. Errors on a single
     Slink are logged and do not abort processing of the remaining ones."""
     logger.debug("Number of SLINKs in file: " + str(len(self.slinks)))
     for slinkTag in self.slinks:
         try:
             slink = Slink(self.xmldoc, self.doctree, slinkTag)
             slink.match_rules(self.rules)
         except Exception:
             # Narrowed from a bare 'except:', which also swallowed
             # SystemExit and KeyboardInterrupt.
             logger.error("Error processing SLINK")
Esempio n. 14
0
def check_trailing_cruft(ignore_paths=None, exit=True):
    """
    Recursively finds all files relative to CWD and checks them for trailing
    whitespace and newlines.

    :param ignore_paths: list of paths to ignore during checks
    :param exit: when True, exit the process with status 1 on any finding
    :return: None
    """
    # A mutable default argument ([]) is shared across calls; use None and
    # normalize instead. Behavior for callers is unchanged.
    if ignore_paths is None:
        ignore_paths = []
    filenames = []
    pruned_filenames = []
    found_error = False
    valid_extensions = ['py', 'yml', 'rb']
    for root, dirs, files in os.walk('.'):
        # gets ./subdirectory/filename
        filenames.extend([os.path.join(root, name) for name in files
                          if name.split(os.extsep)[-1] in valid_extensions])
        # NOTE(review): this adds *directories* with matching extensions;
        # they are filtered out again by the isfile() check below, so it
        # appears redundant -- kept as-is for behavioral parity.
        filenames.extend([os.path.join(root, name) for name in dirs
                          if name.split(os.extsep)[-1] in valid_extensions])

    # only work on files not in our ignore paths
    for f in filenames:
        f_parts = f.split(os.sep)

        try:
            if f_parts[1] in ignore_paths:
                continue
        except IndexError:
            continue

        # don't add directories
        if os.path.isfile(f):
            pruned_filenames.append(f)

    for filename in pruned_filenames:
        # don't process blank files
        if os.path.getsize(filename) < 1:
            continue

        # 'with' closes the handle; the original left the file open.
        with open(filename, 'r') as fh:
            data = fh.readlines()
        newline = trailing_newline(data)
        whitespace = trailing_whitespace(data)

        if newline:
            error = '{}Trailing newline found at the end of {}{}\n'
            logger.error(error.format(colorama.Fore.RED, filename,
                                      colorama.Fore.RESET))
            found_error = True

        if whitespace:
            error = '{}Trailing whitespace found in {} on lines: {}{}\n'
            lines = ', '.join(str(x) for x in whitespace)
            logger.error(error.format(colorama.Fore.RED, filename, lines,
                                      colorama.Fore.RESET))
            found_error = True

    if exit and found_error:
        sys.exit(1)
Esempio n. 15
0
 def createTLinksFromSLinks(self):
     """Calls lookForStlinks for a given Slink object. Errors on a single
     Slink are logged and do not abort processing of the remaining ones."""
     logger.debug("Number of SLINKs in file: "+str(len(self.slinks)))
     for slinkTag in self.slinks:
         try:
             slink = Slink(self.xmldoc, self.doctree, slinkTag)
             slink.match_rules(self.rules)
         except Exception:
             # Narrowed from a bare 'except:', which also swallowed
             # SystemExit and KeyboardInterrupt.
             logger.error("Error processing SLINK")
Esempio n. 16
0
    def apply_component(self, name, wrapper, infile, outfile):

        """Apply a component if the processing parameters determine that the
        component needs to be applied. This method passes the content
        tag and the xml_document to the wrapper of the component and
        asks the wrapper to process the document fragments.

        Component-level errors are trapped here if trap_errors is True.

        Arguments:
           name - string, the name of the component
           wrapper - instance of a subclass of ComponentWrapper
           infile - string
           outfile - string

        Return value: None"""

        # NOTES

        # - Components still write results to file, which is not
        #   conform to the specs. But writing files to disk is but a
        #   minor part of processing time so for now we'll leave it
        #   here and let all components assume that there is an input
        #   file to work with.

        # - Having said that, it is not quite true that the wrappers
        #   use the input file. The wrappers use the xml document and
        #   the content tag and then (i) create fragments from the xml
        #   doc, (ii) process the fragments, (iii) reinsert the
        #   fragments in the xml doc, and (iv) write the xml doc to a
        #   file. But the file rated is not opened by the next
        #   wrapper.

        # - Errors are now trapped here instead of in the component
        #   since we do not tell the component what the output file
        #   is.

        def call_wrapper(wrapper, content_tag, xmldoc, trap_errors, outfile):
            # Run the component and persist the (possibly updated) document.
            wrapper(content_tag, xmldoc, self).process()
            self.xml_document.save_to_file(outfile)

        logger.info("RUNNING " + name + " on: " + infile)
        trap_errors = self.getopt_trap_errors()
        if trap_errors:
            try:
                call_wrapper(wrapper, self.content_tag, self.xml_document,
                             trap_errors, outfile)
            except Exception:
                # sys.exc_type/sys.exc_value are long deprecated and removed
                # in Python 3; sys.exc_info() is the portable equivalent.
                # Also narrowed from a bare 'except:'.
                exc_type, exc_value = sys.exc_info()[:2]
                logger.error(name + " error on " + infile + "\n\t"
                             + str(exc_type) + "\n\t"
                             + str(exc_value) + "\n")
                # Fall back to passing the input through unchanged.
                shutil.copy(infile, outfile)
        else:
            call_wrapper(wrapper, self.content_tag, self.xml_document,
                         trap_errors, outfile)
Esempio n. 17
0
 def get_event_attribute(self, attr, optional=False):
     """Return the value of attribute 'attr' from self.dict. When the
     attribute is missing: (i) log an error unless it is optional, and
     (ii) fall back to 'POS' for the polarity attribute, None otherwise."""
     value = self.dict.get(attr)
     if value is not None:
         return value
     if not optional:
         logger.error("No %s attribute for current event" % attr)
     return 'POS' if attr == LIBRARY.timeml.POL else None
Esempio n. 18
0
    def apply_component(self, name, wrapper, infile, outfile):
        """Apply a component if the processing parameters determine that the
        component needs to be applied. This method passes the content
        tag and the xml_document to the wrapper of the component and
        asks the wrapper to process the document fragments.

        Component-level errors are trapped here if trap_errors is True.

        Arguments:
           name - string, the name of the component
           wrapper - instance of a subclass of ComponentWrapper
           infile - string
           outfile - string

        Return value: None"""

        # NOTES

        # - Components still write results to file, which is not
        #   conform to the specs. But writing files to disk is but a
        #   minor part of processing time so for now we'll leave it
        #   here and let all components assume that there is an input
        #   file to work with.

        # - Having said that, it is not quite true that the wrappers
        #   use the input file. The wrappers use the xml document and
        #   the content tag and then (i) create fragments from the xml
        #   doc, (ii) process the fragments, (iii) reinsert the
        #   fragments in the xml doc, and (iv) write the xml doc to a
        #   file. But the file rated is not opened by the next
        #   wrapper.

        # - Errors are now trapped here instead of in the component
        #   since we do not tell the component what the output file
        #   is.

        def call_wrapper(wrapper, content_tag, xmldoc, trap_errors, outfile):
            # Run the component and persist the (possibly updated) document.
            wrapper(content_tag, xmldoc, self).process()
            self.xml_document.save_to_file(outfile)

        logger.info("RUNNING " + name + " on: " + infile)
        trap_errors = self.getopt_trap_errors()
        if trap_errors:
            try:
                call_wrapper(wrapper, self.content_tag, self.xml_document,
                             trap_errors, outfile)
            except Exception:
                # sys.exc_type/sys.exc_value are long deprecated and removed
                # in Python 3; sys.exc_info() is the portable equivalent.
                # Also narrowed from a bare 'except:'.
                exc_type, exc_value = sys.exc_info()[:2]
                logger.error(name + " error on " + infile + "\n\t" +
                             str(exc_type) + "\n\t" + str(exc_value) +
                             "\n")
                # Fall back to passing the input through unchanged.
                shutil.copy(infile, outfile)
        else:
            call_wrapper(wrapper, self.content_tag, self.xml_document,
                         trap_errors, outfile)
Esempio n. 19
0
def _run_gutime_on_string(input_string):
    """Run the GUTIME Perl script. This takes a string and returns a string."""
    pipe = subprocess.PIPE
    # close_fds is not supported together with redirected streams on Windows.
    process = subprocess.Popen(
        ["perl", "TimeTag.pl"], stdin=pipe, stdout=pipe, stderr=pipe,
        close_fds=(sys.platform != 'win32'))
    result, error = process.communicate(input_string)
    if error:
        logger.error(error)
    return result
Esempio n. 20
0
 def _initialize_nodes(self):
     """Given the VerbChunk or a list of Tokens, set the nodes variable to
     either the daughters of the VerbChunk or the list of Tokens. Also sets
     node and tokens, where the first one has the VerbChunk or None (this is
     so we can hand the chunk to GramVChunk, following GramChunk behaviour),
     and where the second one is the list of Tokens or None."""
     if self.node:
         self.nodes = self.node.dtrs
     elif self.tokens:
         self.nodes = self.tokens
     else:
         logger.error("Incorrect initialization of GramVChunkList")
Esempio n. 21
0
 def _initialize_nodes(self):
     """Given the VerbChunk or a list of Tokens, set the nodes variable to
     either the daughters of the VerbChunk or the list of Tokens. Also sets
     node and tokens, where the first one has the VerbChunk or None (this is
     so we can hand the chunk to VChunkFeatures instance, following
     ChunkFeatures behaviour), and where the second one is the list of Tokens
     or None."""
     if self.node:
         self.nodes = self.node.dtrs
     elif self.tokens:
         self.nodes = self.tokens
     else:
         logger.error("Incorrect initialization of VChunkFeaturesList")
Esempio n. 22
0
 def createTLinksFromALinks(self):
     """Calls alink.lookForAtlinks to add Tlinks from Alinks. This is
     rather moronic unfortunately because it will never do anything
     because at the time of application there are no tlinks in the
     document. Needs to be separated out and apply at a later
     processing stage, after all other tlinking."""
     logger.debug("Number of ALINKs in file: "+str(len(self.alinks)))
     for alinkTag in self.alinks:
         try:
             alink = Alink(self.xmldoc, self.doctree, alinkTag)
             alink.lookForAtlinks()
         except Exception:
             # Narrowed from a bare 'except:', which also swallowed
             # SystemExit and KeyboardInterrupt.
             logger.error("Error processing ALINK")
Esempio n. 23
0
 def _get_executable(self):
     """Get the TreeTagger executable for the platform, or None when the
     platform is unsupported (an error is logged in that case, and also
     when the resolved path is not an existing file)."""
     if sys.platform == "win32":
         executable = os.path.join(self.bindir, WINDOWS_EXECUTABLE)
     elif sys.platform.startswith("linux"):
         # 'linux2' only matched Python 2; startswith also covers the
         # plain 'linux' reported by Python 3.
         executable = os.path.join(self.bindir, LINUX_EXECUTABLE)
     elif sys.platform == "darwin":
         executable = os.path.join(self.bindir, MAC_EXECUTABLE)
     else:
         logger.error("No binary for platform %s" % sys.platform)
         # 'executable' was unbound here, so the isfile() check below
         # raised UnboundLocalError; return None explicitly instead.
         return None
     if not os.path.isfile(executable):
         logger.error("TreeTagger binary invalid: %s" % executable)
     return executable
Esempio n. 24
0
File: main.py Progetto: mnscholz/ttk
 def process_doctree(self, doctree):
     """Apply all S2T rules to doctree. Errors on an individual Slink are
     logged and do not stop processing of the remaining Slinks."""
     self.doctree = doctree
     self.docelement = self.doctree.docelement
     events = self.doctree.tarsqidoc.tags.find_tags('EVENT')
     eventsIdx = dict([(e.attrs['eiid'], e) for e in events])
     for slinktag in self.doctree.slinks:
         slink = Slink(self.doctree, eventsIdx, slinktag)
         try:
             slink.match_rules(self.rules)
         except Exception:
             # Narrowed from a bare 'except:', which also swallowed
             # SystemExit and KeyboardInterrupt.
             logger.error("S2T Error when processing Slink instance")
     self._add_links_to_docelement()
Esempio n. 25
0
 def createTLinksFromALinks(self):
     """Calls alink.lookForAtlinks to add Tlinks from Alinks. This is
     rather moronic unfortunately because it will never do anything
     because at the time of application there are no tlinks in the
     document. Needs to be separated out and apply at a later
     processing stage, after all other tlinking."""
     logger.debug("Number of ALINKs in file: " + str(len(self.alinks)))
     for alinkTag in self.alinks:
         try:
             alink = Alink(self.xmldoc, self.doctree, alinkTag)
             alink.lookForAtlinks()
         except Exception:
             # Narrowed from a bare 'except:', which also swallowed
             # SystemExit and KeyboardInterrupt.
             logger.error("Error processing ALINK")
Esempio n. 26
0
 def _get_executable(self):
     """Get the TreeTagger executable for the platform, or None when the
     platform is unsupported (an error is logged in that case, and also
     when the resolved path is not an existing file)."""
     if sys.platform == "win32":
         executable = os.path.join(self.bindir, WINDOWS_EXECUTABLE)
     elif sys.platform.startswith("linux"):
         # 'linux2' only matched Python 2; startswith also covers the
         # plain 'linux' reported by Python 3.
         executable = os.path.join(self.bindir, LINUX_EXECUTABLE)
     elif sys.platform == "darwin":
         executable = os.path.join(self.bindir, MAC_EXECUTABLE)
     else:
         logger.error("No binary for platform %s" % sys.platform)
         # 'executable' was unbound here, so the isfile() check below
         # raised UnboundLocalError; return None explicitly instead.
         return None
     if not os.path.isfile(executable):
         logger.error("TreeTagger binary invalid: %s" % executable)
     return executable
Esempio n. 27
0
    def __getitem__(self, idx):
        """Return the idx-th sample as a (bsp_data, label) pair; on load
        failure, log the error and return None."""
        try:
            focus_file = np.load(self.data_path + self.index_to_file[idx])
            transform_no = self.index_to_transform[idx]
            bsp_data = focus_file[transform_no]
            label = focus_file[self.label_name]

            return bsp_data, label

        except Exception:
            # Narrowed from a bare 'except:'. The original passed extra
            # positional args with no %-placeholders in the message, which
            # the logging machinery rejects at emit time; use explicit
            # placeholders instead (assumes 'error' is logging.error --
            # TODO confirm).
            error('Error loading file! %s %s %s', self.data_path, idx,
                  self.index_to_file[idx])
Esempio n. 28
0
 def __iter__(self):
     """Yield a TaggedDocument (words + a single extracted tag) for every
     file found under self.source; failures on individual files are logged
     and skipped."""
     for root, subdirs, files in os.walk(self.source):
         for fname in files:
             full_path = os.path.join(root, fname)
             with open(full_path, encoding='utf8') as file:
                 try:
                     text = file.read()
                     text = text.replace('\n', '')
                     words = self.lexicon.review_to_wordlist(utils.to_unicode(text))
                     tag = Constants.extract_tag(full_path)
                     tags = [tag]
                     yield TaggedDocument(words=words, tags=tags)
                 except Exception:
                     # Narrowed from a bare 'except:'; a failure on one file
                     # should not abort the whole iteration, but Ctrl-C must
                     # still propagate.
                     logger.error("failed processing file: %s", fname)
Esempio n. 29
0
File: main.py Progetto: mnscholz/ttk
 def _find_alink(self, sentence, event_expr):
     """Try to find an alink with event_expr as the trigger, alinks are created as a side
     effect."""
     evNode = sentence[event_expr.locInSent]
     if evNode is None:
         logger.error("No node found at locInSent=%s" % event_expr.locInSent)
         return
     # Initialize up front: when forwardFSAs was falsy the original code hit
     # an UnboundLocalError on the 'if not alink_created' test below.
     alink_created = False
     forwardFSAs = event_expr.alinkingContexts('forward')
     if forwardFSAs:
         alink_created = evNode.find_forward_alink(forwardFSAs)
     if not alink_created:
         backwardFSAs = event_expr.alinkingContexts('backwards')
         if backwardFSAs:
             evNode.find_backward_alink(backwardFSAs)
Esempio n. 30
0
 def get_event_attribute(self, attr, optional=False):
     """Return the value of an attribute from self.dict.
     When the attribute is missing: (i) log an error unless it is optional,
     and (ii) return the default 'POS' for the polarity attribute, None
     otherwise.
     Arguments:
        attr - a string
        optional - a boolean"""
     if attr in self.dict:
         return self.dict[attr]
     if not optional:
         logger.error("No %s attribute for current event" % attr)
     return 'POS' if attr == POL else None
Esempio n. 31
0
    def measure_performance_auc(test_y, result_y, result_y_prob):
        """Compute accuracy and, for binary targets, ROC AUC.

        Returns (accuracy, auc) where auc is None for non-binary targets;
        returns None when metric computation fails (the error is logged).
        """
        try:
            vacc = metrics.accuracy_score(test_y, result_y)
            # find validation AUC -- only defined for binary classification
            if len(np.unique(test_y)) == 2:
                vauc = roc_auc_score(test_y, result_y_prob)
                logger.info('Accurary: {0:.3f} and AUC {1:.3f}'.format(
                    vacc, vauc))
            else:
                vauc = None
                logger.info('Accurary: {0:.3f}'.format(vacc))

            return vacc, vauc
        except Exception:
            # Narrowed from a bare 'except:' so Ctrl-C is not swallowed.
            logger.error("Error calculating metrics")
Esempio n. 32
0
    def execute(self):
        """
        Executes ansible-galaxy install

        :return: sh.stdout on success, else None
        :return: None
        """
        if self.galaxy is None:
            self.bake()

        try:
            result = self.galaxy()
        except sh.ErrorReturnCode as e:
            logger.error('ERROR: {}'.format(e))
            sys.exit(e.exit_code)
        else:
            return result.stdout
Esempio n. 33
0
 def process_doctree(self, doctree):
     """Apply all S2T rules to doctree. Errors on an individual Slink are
     logged and do not stop processing of the remaining Slinks."""
     self.doctree = doctree
     # For sanity we clean out the tlinks since we are adding new tlinks to
     # the document, if we don't do this we might add some links twice.
     self.doctree.tlinks = []
     self.docelement = self.doctree.docelement
     events = self.doctree.tarsqidoc.tags.find_tags(LIBRARY.timeml.EVENT)
     eventsIdx = dict([(e.attrs['eiid'], e) for e in events])
     for slinktag in self.doctree.slinks:
         slink = Slink(self.doctree, eventsIdx, slinktag)
         try:
             slink.match_rules(self.rules)
         except Exception:
             # Narrowed from a bare 'except:', which also swallowed
             # SystemExit and KeyboardInterrupt.
             logger.error("S2T Error when processing Slink instance")
     self._add_links_to_tarsqidoc()
Esempio n. 34
0
    def execute(self, hide_errors=False):
        """
        Executes ansible-playbook

        :returns: exit code if any, output of command as string
        """
        if self.ansible is None:
            self.bake()

        try:
            output = self.ansible().stdout
        except (sh.ErrorReturnCode, sh.ErrorReturnCode_2) as e:
            if not hide_errors:
                logger.error('ERROR: {}'.format(e))
            return e.exit_code, None
        return None, output
Esempio n. 35
0
def get_tokens_from_sequence(sequence):
    """Given a sequence of elements, collect all the token leaves and return
    them as a list. Tokens and events are taken as-is; chunks and timexes are
    expanded with get_tokens. Unknown element types are logged as errors."""
    # TODO: this can probably use get_tokens
    collected = []
    for element in sequence:
        if element.isToken():
            collected.append(element)
        elif element.isChunk():
            collected.extend(get_tokens(element))
        elif element.isEvent():
            collected.append(element)
        elif element.isTimex():
            collected.extend(get_tokens(element))
        else:
            logger.error("unknown item type: %s" % element.__class__.__name__)
    return collected
Esempio n. 36
0
 def _apply_component(self, name, wrapper, tarsqidocument):
     """Apply a component by taking the TarsqDocument, which includes the
     options from the Tarsqi instance, and passing it to the component
     wrapper. Component-level errors are trapped here if --trap-errors is
     True. If errors are trapped, it is still possible that partial results
     were written to the TagRepositories in the TarsqiDocument."""
     logger.info(name + '............')
     t1 = time.time()
     if self.options.trap_errors:
         try:
             wrapper(tarsqidocument).process()
         except Exception:
             # sys.exc_type/sys.exc_value were removed in Python 3; use
             # sys.exc_info() instead. Also narrowed from a bare 'except:'.
             exc_type, exc_value = sys.exc_info()[:2]
             logger.error("%s error:\n\t%s\n\t%s\n" %
                          (name, exc_type, exc_value))
     else:
         wrapper(tarsqidocument).process()
     logger.info("%s DONE (%.3f seconds)" % (name, time.time() - t1))
Esempio n. 37
0
 def _print_tags(self, fh, tag_group, tags):
     """Write the given tags to file handle fh, wrapped in a <tag_group>
     element, in sorted order. A tag whose serialization raises
     UnicodeDecodeError is skipped (and the error logged) rather than
     aborting the whole write."""
     fh.write("<%s>\n" % tag_group)
     for tag in sorted(tags):
         try:
             ttk_tag = tag.as_ttk_tag()
             # This became needed after allowing any text in the value of the
             # form and lemma attribute.
             # NOTE(review): 'unicode' makes this branch Python-2-only; under
             # Python 3 it would raise NameError -- confirm target version.
             if isinstance(ttk_tag, str):
                 ttk_tag = unicode(ttk_tag, errors='ignore')
             fh.write("  %s\n" % ttk_tag)
         except UnicodeDecodeError:
             # Not sure why this happened, but there were cases where the
             # result of as_ttk_tag() was a byte string with a non-ascii
             # character. The code in the try clause was changed to prevent
             # the error, but leave the except here just in case.
             logger.error("UnicodeDecodeError on printing a tag.")
     fh.write("</%s>\n" % tag_group)
Esempio n. 38
0
def get_tokens_from_sequence(sequence):
    """Given a sequence of elements, collect all the token leaves and return
    them as a list. Tokens are collected in document order; elements of an
    unrecognized type are logged and skipped."""
    # TODO: this can probably use get_tokens
    tokens = []
    for item in sequence:
        if item.isToken():
            tokens.append(item)
        elif item.isChunk() or item.isEvent() or item.isTimex():
            # chunks, events and timexes all contribute their embedded
            # tokens in exactly the same way, so the three previously
            # separate branches are collapsed into one
            tokens += get_tokens(item)
        else:
            logger.error("unknown item type: %s" % item.__class__.__name__)
    return tokens
Esempio n. 39
0
File: tarsqi.py Progetto: tarsqi/ttk
 def _apply_component(self, name, wrapper, tarsqidocument):
     """Apply a component by taking the TarsqiDocument, which includes the
     options from the Tarsqi instance, and passing it to the component
     wrapper. Component-level errors are trapped here if --trap-errors is
     True. If errors are trapped, it is still possible that partial results
     were written to the TagRepositories in the TarsqiDocument."""
     logger.info(name + '............')
     t1 = time.time()
     if self.options.trap_errors:
         try:
             wrapper(tarsqidocument).process()
         except Exception:
             # sys.exc_info() replaces the deprecated sys.exc_type and
             # sys.exc_value module globals (removed in Python 3 and not
             # thread-safe even in Python 2). Catching Exception instead of
             # a bare except also lets KeyboardInterrupt/SystemExit through.
             exc_type, exc_value = sys.exc_info()[:2]
             logger.error("%s error:\n\t%s\n\t%s\n"
                          % (name, exc_type, exc_value))
     else:
         wrapper(tarsqidocument).process()
     logger.info("%s DONE (%.3f seconds)" % (name, time.time() - t1))
Esempio n. 40
0
    def diffbot_article_api(self, query_limit=0):
        """Fetch content from the Diffbot article API for pending Article
        rows and update them in the database.

        Arguments:
           query_limit - maximum number of articles to process; 0 or a
                         negative value means process every unfetched
                         article (status == 0)

        Returns the list of Diffbot response objects for articles with a
        non-empty body. Per-article failures are logged and collected; if
        any occurred, a RuntimeError is raised after the loop finishes."""
        resp = []
        exceptions = []
        if query_limit <= 0:
            # no limit: process every article that has not been fetched yet
            self.urls = (Article
                         .select()
                         .where(Article.status == 0))
        else:
            # with a limit: sample randomly over unfetched and fetched rows
            self.urls = (Article
                         .select()
                         .where((Article.status == 0) | (Article.status == 1))
                         .order_by(fn.Random())
                         .limit(query_limit))

        for url in self.urls:
            try:
                print(url.article_url)
                response = self.diffbot.request(
                    url.article_url, self.token, 'article')

                title = response['objects'][0]['title']
                if response['objects'][0]['text'] == "":
                    # empty body: record the title but leave status at 0 so
                    # the article will be retried on a later run
                    u = Article.update(
                        title=title,
                        modified_utc=datetime.utcnow(),
                    ).where(Article.id == url.id)
                else:
                    resp.append(response['objects'][0])
                    u = Article.update(
                        status=1,
                        title=title,
                        content=response['objects'][0],
                        modified_utc=datetime.utcnow(),
                    ).where(Article.id == url.id)
                u.execute()

            except Exception as e:
                logger.error('%s happened when handling %s', str(e), url)
                exceptions.append(e)
                continue
        # BUG FIX: this check used to sit AFTER the return statement and was
        # therefore unreachable; raise before returning so callers actually
        # see that some articles failed.
        if exceptions:
            raise RuntimeError('Error received from Diffbot')
        return resp
Esempio n. 41
0
 def run_timex_linking(self):
     """Apply the rules that govern relations between TIMEX3 tags. Only
     applies to TIMEX3 tags with type=DATE."""
     # TODO: add a DCT TIMEX tag if it is not in the tags dictionary, but
     # maybe check first whether it is in the dictionary in case we care
     # about duplications (see https://github.com/tarsqi/ttk/issues/10 and
     # https://github.com/tarsqi/ttk/issues/13)
     date_timexes = [t for t in self.tarsqidoc.tags.find_tags(TIMEX)
                     if t.attrs[TYPE] == 'DATE']
     trap = self.tarsqidoc.options.trap_errors
     for t1, t2 in _timex_pairs(date_timexes):
         if not trap:
             self._create_timex_link(t1, t2)
             continue
         # with --trap-errors, a failure on one pair must not stop the rest
         try:
             self._create_timex_link(t1, t2)
         except Exception:
             logger.error("Error linking:\n%s\n%s" % (t1, t2))
Esempio n. 42
0
    def _run_timex_linking(self):

        """Apply the rules that govern relations between TIMEX3 tags. Only
        applies to TIMEX3 tags with a TYPE attribute equal to DATE (the old
        docstring said VAL, but the filter below tests TYPE). Timexes that
        lack a VAL attribute are logged as warnings before linking."""

        timexes = [timex for timex in self.xmldoc.get_tags(TIMEX)
                   if timex.attrs['TYPE'] == 'DATE']
        for t in timexes:
            if t.attrs.get('VAL') is None:
                logger.warn("Missing VAL: %s" % t.get_content())

        # visit each unordered pair exactly once; starting the inner loop
        # at i + 1 replaces the old full n*n scan guarded by "if i < j"
        for i in range(len(timexes)):
            for j in range(i + 1, len(timexes)):
                try:
                    self._create_timex_link(timexes[i], timexes[j])
                except Exception:
                    logger.error("Error in Timex Linking:\n%s\n%s" %
                                 (timexes[i].get_content(),
                                  timexes[j].get_content()))
Esempio n. 43
0
    def _find_lexically_based_slinks(self, event_expr):

        """Try to find lexically based Slinks using forward, backward and
        reporting FSA patterns. No return value; if an Slink is found, it
        will be created by the chunk that embeds the Slink triggering
        event. The three pattern groups are tried in order and the search
        stops at the first group that creates an Slink.

        Arguments:
           event_expr - an EventExpression"""

        evNode = self.currSent[event_expr.locInSent]
        if evNode is None:
            # BUG FIX: previously execution fell through after logging the
            # error, which then raised AttributeError when evNode was used
            # below; bail out instead.
            logger.error("No event node found at locInSent")
            return

        forwardFSAs = event_expr.slinkingContexts('forward')
        if forwardFSAs:
            evNode.find_forward_slink(forwardFSAs)
            if evNode.createdLexicalSlink:
                # reset the flag so a later event on the same node is not
                # mistaken for having created an Slink
                evNode.createdLexicalSlink = 0
                return

        backwardFSAs = event_expr.slinkingContexts('backwards')
        if backwardFSAs:
            logger.debug("PROCESS for BACKWARD slinks")
            evNode.find_backward_slink(backwardFSAs)
            if evNode.createdLexicalSlink:
                evNode.createdLexicalSlink = 0
                return

        reportingFSAs = event_expr.slinkingContexts('reporting')
        if reportingFSAs:
            logger.debug("PROCESS for REPORTING slinks")
            evNode.find_reporting_slink(reportingFSAs)
            if evNode.createdLexicalSlink:
                evNode.createdLexicalSlink = 0
Esempio n. 44
0
    def setup_docmodel(self, tarsqi_instance):

        """Initialize the document_model and processing_parameters instance
        variables of a TarsqiControl instance, using its data source
        identifier and processing options.

        Arguments:
           tarsqi_instance - a TarsqiControl instance

        No return value."""

        tarsqi_instance.processing_parameters = ProcessingParameters(tarsqi_instance)
        data_source_identifier = tarsqi_instance.data_source_identifier

        constructor = self.dsi_to_docmodelconstructor.get(data_source_identifier)
        if constructor is None:
            # BUG FIX: the old code called constructor(...) inside a
            # try/except TypeError (Python 2 comma syntax), which also
            # swallowed TypeErrors raised from within a valid constructor;
            # test for the missing mapping explicitly and fall back to
            # simple-xml as the default.
            logger.error("Unknown data source identifier, using simple-xml")
            tarsqi_instance.data_source_identifier = 'simple-xml'
            self._setup_docmodel_simple_xml(tarsqi_instance)
        else:
            constructor(tarsqi_instance)
Esempio n. 45
0
 def process(self, infile, outfile):
     """Process a file fragment. This default implementation is a stub:
     component wrappers call process() on their component, and every
     subclass is expected to override it. Reaching this body therefore
     indicates a missing override, which is recorded in the log."""
     logger.error("TarsqiComponent.process() not overridden")