Exemple #1
0
def enter_words_only(page_pk, word_array):
    """Bulk-insert the words of one page into ``documents_pageword``.

    Every word becomes one ';'-delimited, fully quoted CSV row in an
    in-memory buffer; the buffer is then handed to ``cursor.copy_expert``
    together with a ``COPY ... FROM STDIN`` statement. This variant writes
    no line number column.

    Arguments:
        page_pk: value written to the ``page_pk`` column of every row.
        word_array: words to store. They are passed through
            ``get_word_shapes`` first; each resulting item is indexed by
            'poly' (its ``.hex`` attribute is stored), 'text', 'bbox' and
            'word_num'.

    Relies on names defined outside this function: ``cursor``,
    ``fields_without_lines``, ``QUOTE_CHAR`` and ``ESCAPE_CHAR``.
    NOTE(review): ``cursor`` is presumably a psycopg2 cursor - confirm.
    """
    csv_options = {
        'restval': "",
        # Keys that are not listed in fields_without_lines are dropped
        # silently instead of raising.
        'extrasaction': 'ignore',
        'lineterminator': '\n',
        'delimiter': ";",
        'quoting': csv.QUOTE_ALL,
        'quotechar': QUOTE_CHAR,
        'escapechar': ESCAPE_CHAR,
    }
    csv_buffer = StringIO()
    row_writer = csv.DictWriter(csv_buffer, fields_without_lines,
                                **csv_options)

    shaped_words = get_word_shapes(word_array)
    row_writer.writerows(
        {
            'poly': entry['poly'].hex,
            'text': word_clean(entry['text']),
            'page_pk': page_pk,
            'bbox': entry['bbox'],
            'word_num': entry['word_num'],
        }
        for entry in shaped_words
    )

    # Remember how much was written before rewinding; that amount is used
    # as the read size for the COPY below.
    bytes_written = csv_buffer.tell()
    csv_buffer.seek(0)

    copy_statement = (
        "COPY documents_pageword (page_pk, text, bbox, poly, word_num) "
        "FROM STDIN delimiter ';' escape '%s' quote '%s' CSV "
    ) % (ESCAPE_CHAR, QUOTE_CHAR)
    cursor.copy_expert(copy_statement, csv_buffer, size=bytes_written)
 def handle(self, *args, **options):
     """Test command: print the first page of every sample .html file.

     Walks SAMPLE_FILE_DIR (including subdirectories). For each file whose
     name ends in ".html" it builds a document_parser with latin-1
     encoding, takes the first document, extracts its words with
     get_words_with_lines_from_page, attaches shapes to them with
     get_word_shapes and prints the resulting page.

     Arguments:
         *args: not used.
         **options: not used.
     """
     for dir_path, _, files in os.walk(SAMPLE_FILE_DIR):
         for this_file in files:
             # Ignore files that aren't .html in case any got mixed in
             # there. The check is made on the file name only, so a
             # directory whose name contains ".html" cannot match.
             # May want to filter on other criteria here too.
             if not this_file.endswith(".html"):
                 continue

             # os.walk descends into subdirectories, so the path must be
             # built from the directory currently being walked rather
             # than from the root directory.
             file_path = os.path.join(dir_path, this_file)
             print("Handling %s" % (file_path))

             parser = document_parser(file_path, encoding='latin-1')
             first_page = parser.next_document()
             page = get_words_with_lines_from_page(first_page.getvalue())
             page['words'] = get_word_shapes(page['words'])
             print(page)
Exemple #3
0
def enter_words_only(page_pk, word_array):
    """Bulk-insert the words of one page into ``documents_pageword``.

    Builds one ';'-delimited, fully quoted CSV row per word in an in-memory
    buffer and passes the buffer to ``cursor.copy_expert`` with a
    ``COPY ... FROM STDIN`` statement covering the columns
    ``page_pk, word, bbox, word_num``.

    Arguments:
        page_pk: value written to the ``page_pk`` column of every row.
        word_array: words to store. They are passed through
            ``get_word_shapes`` first; each resulting item is indexed by
            'poly', 'text' and 'word_num'. The ``.hex`` attribute of 'poly'
            is what gets written under the 'bbox' key.

    Relies on names defined outside this function: ``cursor``,
    ``fields_without_lines``, ``QUOTE_CHAR`` and ``ESCAPE_CHAR``.
    NOTE(review): rows are keyed by 'word'; because extrasaction is
    'ignore', that value is dropped silently unless fields_without_lines
    lists 'word' - confirm against its definition.
    """
    copy_statement = (
        "COPY documents_pageword (page_pk, word, bbox, word_num) "
        "FROM STDIN delimiter ';' escape '%s' quote '%s' CSV "
    ) % (ESCAPE_CHAR, QUOTE_CHAR)

    csv_buffer = StringIO()
    row_writer = csv.DictWriter(
        csv_buffer,
        fields_without_lines,
        restval="",
        extrasaction='ignore',
        lineterminator='\n',
        delimiter=";",
        quoting=csv.QUOTE_ALL,
        quotechar=QUOTE_CHAR,
        escapechar=ESCAPE_CHAR,
    )

    for entry in get_word_shapes(word_array):
        hex_shape = entry['poly'].hex
        cleaned_text = word_clean(entry['text'])
        row = {
            'page_pk': page_pk,
            'word': cleaned_text,
            'bbox': hex_shape,
            'word_num': entry['word_num'],
        }
        row_writer.writerow(row)

    # Capture the amount written before rewinding; it is passed as the
    # read size for the COPY.
    bytes_written = csv_buffer.tell()
    csv_buffer.seek(0)
    cursor.copy_expert(copy_statement, csv_buffer, size=bytes_written)
Exemple #4
0
def enter_words(page_pk, word_array):
    """Bulk-insert the words of one page, with line numbers, via COPY.

    Builds one ';'-delimited, fully quoted CSV row per word in an in-memory
    buffer and passes the buffer to ``cursor.copy_expert`` with a
    ``COPY ... FROM STDIN`` statement covering the columns
    ``page_pk, text, bbox, poly, word_num, line_num`` of
    ``documents_pageword``.

    Arguments:
        page_pk: value written to the ``page_pk`` column of every row.
        word_array: words to store. They are passed through
            ``get_word_shapes`` first; each resulting item is indexed by
            'poly' (its ``.hex`` attribute is stored), 'text', 'bbox',
            'word_num' and 'line_num'.

    Relies on names defined outside this function: ``cursor``, ``fields``,
    ``QUOTE_CHAR`` and ``ESCAPE_CHAR``.
    NOTE(review): ``cursor`` is presumably a psycopg2 cursor - confirm.
    """
    csv_buffer = StringIO()
    row_writer = csv.DictWriter(
        csv_buffer,
        fields,
        restval="",
        extrasaction='ignore',
        lineterminator='\n',
        delimiter=";",
        quoting=csv.QUOTE_ALL,
        quotechar=QUOTE_CHAR,
        escapechar=ESCAPE_CHAR,
    )

    shaped_words = get_word_shapes(word_array)
    row_writer.writerows(
        {
            'poly': entry['poly'].hex,
            'text': word_clean(entry['text']),
            'page_pk': page_pk,
            'bbox': entry['bbox'],
            'word_num': entry['word_num'],
            'line_num': entry['line_num'],
        }
        for entry in shaped_words
    )

    # The write position after the last row is used as the read size for
    # the COPY; it has to be taken before the buffer is rewound.
    bytes_written = csv_buffer.tell()

    # To debug quoting/escaping problems, dump csv_buffer.getvalue() here.

    csv_buffer.seek(0)
    copy_statement = (
        "COPY documents_pageword "
        "(page_pk, text, bbox, poly, word_num, line_num) "
        "FROM STDIN delimiter ';' escape '%s' quote '%s' CSV "
    ) % (ESCAPE_CHAR, QUOTE_CHAR)
    cursor.copy_expert(copy_statement, csv_buffer, size=bytes_written)