def enter_words_only(page_pk, word_array):
    """Bulk-insert one page's words into documents_pageword (no line numbers).

    Every word becomes one CSV row in an in-memory buffer; the buffer is then
    handed to ``cursor.copy_expert`` as the input of a single
    ``COPY ... FROM STDIN`` statement.

    Args:
        page_pk: value stored in the ``page_pk`` column of every row.
        word_array: the page's words. They are first passed through
            ``get_word_shapes``; each resulting item is read for the keys
            'poly', 'text', 'bbox' and 'word_num'.
    """
    # NOTE(review): the CSV column order comes from fields_without_lines and
    # is expected to line up with the COPY column list below -- confirm.
    buf = StringIO()
    csv_out = csv.DictWriter(
        buf,
        fields_without_lines,
        restval="",
        extrasaction='ignore',
        lineterminator='\n',
        delimiter=";",
        quoting=csv.QUOTE_ALL,
        quotechar=QUOTE_CHAR,
        escapechar=ESCAPE_CHAR
    )

    for entry in get_word_shapes(word_array):
        # Hex form of the word's shape (presumably hex-encoded WKB of a GEOS
        # geometry -- verify against get_word_shapes).
        poly_hex = entry['poly'].hex
        cleaned_text = word_clean(entry['text'])
        csv_out.writerow(dict(
            page_pk=page_pk,
            text=cleaned_text,
            bbox=entry['bbox'],
            poly=poly_hex,
            word_num=entry['word_num']
        ))

    # Remember how much was written, then rewind so COPY reads from the start.
    buffered_size = buf.tell()
    buf.seek(0)

    copy_sql = (
        "COPY documents_pageword (page_pk, text, bbox, poly, word_num) "
        "FROM STDIN delimiter ';' escape '%s' quote '%s' CSV "
    ) % (ESCAPE_CHAR, QUOTE_CHAR)
    cursor.copy_expert(copy_sql, buf, size=buffered_size)
def handle(self, *args, **options):
    """Test command: print the first page of every sample .html file.

    Walks SAMPLE_FILE_DIR and, for each file whose name ends in ".html",
    opens it with ``document_parser`` (latin-1), takes the first document,
    extracts its words with ``get_words_with_lines_from_page``, replaces
    ``page['words']`` with the result of ``get_word_shapes`` (presumably
    attaching GEOS geometries -- confirm) and prints the page.

    ``args`` and ``options`` are accepted but not used.
    """
    for dirpath, _, filenames in os.walk(SAMPLE_FILE_DIR):
        for filename in filenames:
            # Ignore files that aren't .html in case any got mixed in there;
            # may want to filter on other criteria here too. Checking the
            # suffix of the file name avoids matching ".html" that merely
            # appears somewhere in the directory part of the path.
            if not filename.endswith(".html"):
                continue

            # Join against the directory currently being walked so files in
            # subdirectories resolve correctly, and so SAMPLE_FILE_DIR does
            # not need a trailing separator.
            file_path = os.path.join(dirpath, filename)

            # Single-argument parenthesised print behaves the same with the
            # Python 2 print statement and the print function.
            print("Handling %s" % (file_path))

            parser = document_parser(file_path, encoding='latin-1')
            first_page = parser.next_document()
            page = get_words_with_lines_from_page(first_page.getvalue())
            page['words'] = get_word_shapes(page['words'])
            print(page)
def enter_words_only(page_pk, word_array):
    """Bulk-insert one page's words into documents_pageword (no line numbers).

    Every word becomes one CSV row in an in-memory buffer; the buffer is then
    handed to ``cursor.copy_expert`` as the input of a single
    ``COPY ... FROM STDIN`` statement targeting the columns
    ``page_pk, word, bbox, word_num``.

    Args:
        page_pk: value stored in the ``page_pk`` column of every row.
        word_array: the page's words. They are first passed through
            ``get_word_shapes``; each resulting item is read for the keys
            'poly', 'text' and 'word_num'.
    """
    # NOTE(review): the CSV column order comes from fields_without_lines and
    # is expected to line up with the COPY column list below -- confirm.
    buf = StringIO()
    csv_out = csv.DictWriter(
        buf,
        fields_without_lines,
        restval="",
        extrasaction='ignore',
        lineterminator='\n',
        delimiter=";",
        quoting=csv.QUOTE_ALL,
        quotechar=QUOTE_CHAR,
        escapechar=ESCAPE_CHAR
    )

    for entry in get_word_shapes(word_array):
        # Hex form of the word's shape; in this variant it is what gets
        # written under the 'bbox' key.
        poly_hex = entry['poly'].hex
        cleaned_text = word_clean(entry['text'])
        csv_out.writerow(dict(
            page_pk=page_pk,
            word=cleaned_text,
            bbox=poly_hex,
            word_num=entry['word_num']
        ))

    # Remember how much was written, then rewind so COPY reads from the start.
    buffered_size = buf.tell()
    buf.seek(0)

    copy_sql = (
        "COPY documents_pageword (page_pk, word, bbox, word_num) "
        "FROM STDIN delimiter ';' escape '%s' quote '%s' CSV "
    ) % (ESCAPE_CHAR, QUOTE_CHAR)
    cursor.copy_expert(copy_sql, buf, size=buffered_size)
def enter_words(page_pk, word_array):
    """Bulk-insert one page's words, including line numbers, into documents_pageword.

    Every word becomes one CSV row in an in-memory buffer; the buffer is then
    handed to ``cursor.copy_expert`` as the input of a single
    ``COPY ... FROM STDIN`` statement.

    Args:
        page_pk: value stored in the ``page_pk`` column of every row.
        word_array: the page's words. They are first passed through
            ``get_word_shapes``; each resulting item is read for the keys
            'poly', 'text', 'bbox', 'word_num' and 'line_num'.
    """
    # NOTE(review): the CSV column order comes from `fields` and is expected
    # to line up with the COPY column list below -- confirm.
    buf = StringIO()
    csv_out = csv.DictWriter(
        buf,
        fields,
        restval="",
        extrasaction='ignore',
        lineterminator='\n',
        delimiter=";",
        quoting=csv.QUOTE_ALL,
        quotechar=QUOTE_CHAR,
        escapechar=ESCAPE_CHAR
    )

    for entry in get_word_shapes(word_array):
        # Hex form of the word's shape (presumably hex-encoded WKB of a GEOS
        # geometry -- verify against get_word_shapes).
        poly_hex = entry['poly'].hex
        cleaned_text = word_clean(entry['text'])
        csv_out.writerow(dict(
            page_pk=page_pk,
            text=cleaned_text,
            bbox=entry['bbox'],
            poly=poly_hex,
            word_num=entry['word_num'],
            line_num=entry['line_num']
        ))

    buffered_size = buf.tell()
    # To debug quoting/escaping problems, dump buf.getvalue() at this point
    # to inspect the raw CSV that will be sent to COPY.
    buf.seek(0)

    copy_sql = (
        "COPY documents_pageword (page_pk, text, bbox, poly, word_num, line_num) "
        "FROM STDIN delimiter ';' escape '%s' quote '%s' CSV "
    ) % (ESCAPE_CHAR, QUOTE_CHAR)
    cursor.copy_expert(copy_sql, buf, size=buffered_size)