コード例 #1
0
def main():
    """Print the cleaned bag of words extracted from an XML file.

    The file named by ``--file``/``-f`` is read as UTF-8 and handed to
    ``BagParser``. Every non-empty text returned by ``strip_text`` is split
    on whitespace, each token is passed through ``clean``, the non-empty
    results are joined and given to ``strip_identifiers``, and the surviving
    terms are printed on one space-separated line with single quotes
    backslash-escaped.

    Exits through ``op.error`` when no file is supplied and with status 1
    when the parser produced no XML tree.
    """
    # Local import: io.open gives explicit decoding on Python 2 and 3 alike.
    import io

    op = OptionParser()
    op.add_option('--file', '-f')

    options, _ = op.parse_args()

    # The XML comes from a file rather than an argument: documents are often
    # too long for the command line and quoting them is tedious.
    if not options.file:
        op.error('No xml file')

    exclude_tags = ['schemaLocation', 'noNamespaceSchemaLocation']

    # Decode explicitly. With the builtin open() on Python 2 the content is a
    # byte string, and calling .encode('utf-8') on it triggers an implicit
    # ASCII decode that fails on any non-ASCII character.
    with io.open(options.file, 'r', encoding='utf-8') as f:
        xml_as_string = f.read()

    bp = BagParser(xml_as_string.encode('utf-8'), True, False)
    if bp.parser.xml is None:
        sys.stderr.write('Failed xml parse\n')
        sys.exit(1)

    # Flatten the text of every item that has some into individual tokens.
    tokens = [
        token
        for item in bp.strip_text(exclude_tags) if item[1]
        for token in item[1].split()
    ]
    cleaned_text = [clean(token) for token in tokens]
    bow = strip_identifiers(' '.join(c for c in cleaned_text if c))

    line = ' '.join(b.encode('utf-8') for b in bow if b)
    # "\'" is just "'", so the previous replace did nothing; "\\'" actually
    # inserts the backslash.
    print(line.replace("'", "\\'"))
コード例 #2
0
    def _parse(self):
        """Collect ``(tag, text)`` pairs from the source XML in ``self.texts``.

        ``self.source_xml_as_str`` is parsed with ``BagParser`` and the
        ``(tag, text)`` pairs from its ``strip_text()`` are walked. A pair is
        dropped when its tag contains any entry of ``self.tag_excludes`` or
        when its text is blank. Otherwise the text is split on whitespace and
        one pair per token is appended, unless the tag contains the second
        element of some ``_rule_set`` entry (an identifier field), in which
        case the text is appended whole.

        Raises:
            Exception: whatever ``BagParser`` raised, re-raised unchanged, or
                ``Exception('failed to parse')`` when no XML tree resulted.
        """
        try:
            parser = BagParser(self.source_xml_as_str, True, False)
        except Exception as ex:
            # Report the failure, then use a bare ``raise`` so the original
            # traceback is kept (``raise ex`` resets it on Python 2).
            print(ex)
            raise
        if not parser or parser.parser.xml is None:
            raise Exception('failed to parse')

        for tag, txt in parser.strip_text():
            # skip text that comes from a tag we know we need to exclude
            if any(t in tag for t in self.tag_excludes):
                continue

            if not txt.strip():
                continue

            # do not split the text if it comes from an identifier field
            if any(r[1] in tag for r in _rule_set):
                self.texts += [(tag, txt)]
            else:
                self.texts += [(tag, t) for t in txt.split()]
コード例 #3
0
    # get some responses
    responses = session.query(Response).filter(
        and_(*clauses)).limit(LIMIT).offset(i).all()
    
    print 'processing', i, len(responses)
    
    for response in responses:
        cleaned_content = response.cleaned_content

        # omg. skip the big ones for regex hangs?
        if len(cleaned_content.encode('utf-8')) / 1048576.0 > 1.:
            print 'SKIPPING big file', response.id
            continue
        
        # strip the html cruft but ignore the a tags
        bp = BagParser(cleaned_content.encode('utf-8'), True, False)
        if bp.parser.xml is None:
            print 'NOT XML: ', cleaned_content[:100]
            continue
        # we don't care about the fully qualified namespace here
        stripped_text = [b[1].split() for b in bp.strip_text(exclude_tags) if b[1]]
        stripped_text = list(chain.from_iterable(stripped_text))
        cleaned_text = [s for s in stripped_text if clean(s)]

        bow = strip_identifiers(' '.join(cleaned_text))
        
        bag = BagOfWords(
            generated_on=datetime.now().isoformat(),
            bag_of_words=bow,
            method="basic",
            response_id=response.id