def main():
    """Print a space-separated bag of words built from an XML file.

    The file path is taken from the ``--file`` / ``-f`` command-line option.
    The XML is parsed with ``BagParser``; the text of every tag not listed in
    ``exclude_tags`` is split on whitespace, each token is passed through
    ``clean``, the non-empty results are joined and handed to
    ``strip_identifiers``, and the non-empty items it yields are printed as
    one UTF-8 encoded line with single quotes backslash-escaped.

    Exits through ``op.error`` when no file option is given, and with status 1
    when ``BagParser`` produced no XML tree.
    """
    # Local import so this function does not rely on the module's import block.
    import io

    op = OptionParser()
    op.add_option('--file', '-f')
    options, _ = op.parse_args()

    # The XML comes from a file: it is sometimes too long to pass as an
    # argument, and quoting it on the command line is tedious.
    if not options.file:
        op.error('No xml file')

    exclude_tags = ['schemaLocation', 'noNamespaceSchemaLocation']

    # Decode explicitly as UTF-8. With the builtin open() on Python 2, read()
    # returns bytes and the .encode() below would first decode them as ASCII,
    # raising UnicodeDecodeError on any non-ASCII document.
    with io.open(options.file, 'r', encoding='utf-8') as f:
        xml_as_string = f.read()

    bp = BagParser(xml_as_string.encode('utf-8'), True, False)
    if bp.parser.xml is None:
        sys.stderr.write('Failed xml parse\n')
        sys.exit(1)

    # strip_text yields pairs whose second item is the text; split each text
    # into whitespace-delimited tokens and flatten the per-tag lists.
    stripped_text = [b[1].split() for b in bp.strip_text(exclude_tags) if b[1]]
    stripped_text = list(chain.from_iterable(stripped_text))

    cleaned_text = [clean(s) for s in stripped_text]
    bow = strip_identifiers(' '.join([c for c in cleaned_text if c]))

    # "\\'" is a real backslash followed by a quote. The previous "\'" was
    # just "'", which made the replace a no-op.
    # NOTE(review): assumes the intent was to escape quotes for a consumer
    # such as a shell -- confirm.
    print(' '.join([b.encode('utf-8') for b in bow if b]).replace("'", "\\'"))
def _parse(self):
    """Fill ``self.texts`` with ``(tag, text)`` pairs from the source XML.

    ``self.source_xml_as_str`` is parsed with ``BagParser``. For every
    ``(tag, text)`` pair yielded by ``strip_text()``:

    * pairs whose tag contains any entry of ``self.tag_excludes`` are skipped;
    * pairs whose text is empty or whitespace-only are skipped;
    * if the tag contains the second element of any ``_rule_set`` entry, the
      text is kept whole as a single ``(tag, text)`` pair;
    * otherwise the text is split on whitespace and one ``(tag, token)`` pair
      is added per token.

    Raises:
        Exception: whatever ``BagParser`` raised (printed first, then
            re-raised with its original traceback), or
            ``Exception('failed to parse')`` when the parser is falsy or holds
            no XML tree.
    """
    try:
        parser = BagParser(self.source_xml_as_str, True, False)
    except Exception as ex:
        print(ex)
        # Bare raise re-raises the active exception with its original
        # traceback; `raise ex` would restart the traceback here on Python 2.
        raise

    if not parser or parser.parser.xml is None:
        raise Exception('failed to parse')

    for tag, txt in parser.strip_text():
        # skip text that comes from a tag we know we need to exclude
        if any(t in tag for t in self.tag_excludes):
            continue

        if not txt.strip():
            continue

        # do not split the text if it comes from an identifier field
        if any(r[1] in tag for r in _rule_set):
            self.texts += [(tag, txt)]
        else:
            self.texts += ((tag, t) for t in txt.split())
# get some responses responses = session.query(Response).filter( and_(*clauses)).limit(LIMIT).offset(i).all() print 'processing', i, len(responses) for response in responses: cleaned_content = response.cleaned_content # omg. skip the big ones for regex hangs? if len(cleaned_content.encode('utf-8')) / 1048576.0 > 1.: print 'SKIPPING big file', response.id continue # strip the html cruft but ignore the a tags bp = BagParser(cleaned_content.encode('utf-8'), True, False) if bp.parser.xml is None: print 'NOT XML: ', cleaned_content[:100] continue # we don't care about the fully qualified namespace here stripped_text = [b[1].split() for b in bp.strip_text(exclude_tags) if b[1]] stripped_text = list(chain.from_iterable(stripped_text)) cleaned_text = [s for s in stripped_text if clean(s)] bow = strip_identifiers(' '.join(cleaned_text)) bag = BagOfWords( generated_on=datetime.now().isoformat(), bag_of_words=bow, method="basic", response_id=response.id