コード例 #1
0
ファイル: cleaning_corpus.py プロジェクト: lngvietthang/omwtk
def clean_corpus(inputfile):
    """Clean a raw text file and write plain and line-numbered copies.

    Each line of ``inputfile`` is passed through ``remove_numbering`` and then
    ``remove_special_chars``. Two files are written into ``OUT_DIR``, both
    named after ``FileTool.getfilename(inputfile)``:

    * ``<name>.cleaned.txt`` -- one cleaned line per input line;
    * ``<name>.num.txt`` -- 1-based line number, a tab, then the cleaned line.

    After processing, the sorted set of characters found in the cleaned text
    is printed as a quick sanity check.

    Args:
        inputfile: Path of the UTF-8 encoded text file to clean.
    """
    print("Script for cleaning raw text input")
    # NOTE(review): this Counter exposes count()/summarise(), so it is not
    # collections.Counter -- presumably the project's own helper; confirm.
    c = Counter()
    all_chars = set()

    base_name = FileTool.getfilename(inputfile)
    output_file = os.path.join(OUT_DIR, base_name + '.cleaned.txt')
    output_numfile = os.path.join(OUT_DIR, base_name + '.num.txt')
    print("Input file            : %s" % (inputfile))
    print("Output file           : %s" % (output_file))
    print("Output (numbered) file: %s" % (output_numfile))

    with open(inputfile, 'r', encoding='utf8') as infile, \
            open(output_file, 'w', encoding='utf8') as outfile, \
            open(output_numfile, 'w', encoding='utf8') as outnumfile:
        for linenum, line in enumerate(infile):
            c.count("Line")
            cleaned_line = remove_numbering(line)
            cleaned_line = remove_special_chars(cleaned_line)
            # Record every distinct character that survived cleaning.
            all_chars.update(cleaned_line)
            outfile.write("%s\n" % cleaned_line)
            outnumfile.write("%s\t%s\n" % (linenum + 1, cleaned_line))
        c.summarise()
    print("-" * 80)
    sorted_chars = sorted(all_chars)
    try:
        print("All characters: %s" % str(sorted_chars))
    except UnicodeEncodeError:
        # The console encoding cannot represent some of the characters. Fall
        # back to an ASCII-escaped listing rather than silently dropping the
        # report (and rather than hiding unrelated errors with a bare except).
        print("All characters: %s" % ascii(sorted_chars))
    print("Done!")
コード例 #2
0
def clean_corpus(inputfile):
    """Clean a raw text file and write plain and line-numbered copies.

    Each line of ``inputfile`` is passed through ``remove_numbering`` and then
    ``remove_special_chars``. Two files are written into ``OUT_DIR``, both
    named after ``FileTool.getfilename(inputfile)``:

    * ``<name>.cleaned.txt`` -- one cleaned line per input line;
    * ``<name>.num.txt`` -- 1-based line number, a tab, then the cleaned line.

    After processing, the sorted set of characters found in the cleaned text
    is printed as a quick sanity check.

    Args:
        inputfile: Path of the UTF-8 encoded text file to clean.
    """
    print("Script for cleaning raw text input")
    # NOTE(review): this Counter exposes count()/summarise(), so it is not
    # collections.Counter -- presumably the project's own helper; confirm.
    c = Counter()
    all_chars = set()

    base_name = FileTool.getfilename(inputfile)
    output_file = os.path.join(OUT_DIR, base_name + '.cleaned.txt')
    output_numfile = os.path.join(OUT_DIR, base_name + '.num.txt')
    print("Input file            : %s" % (inputfile))
    print("Output file           : %s" % (output_file))
    print("Output (numbered) file: %s" % (output_numfile))

    with open(inputfile, 'r', encoding='utf8') as infile, \
            open(output_file, 'w', encoding='utf8') as outfile, \
            open(output_numfile, 'w', encoding='utf8') as outnumfile:
        for linenum, line in enumerate(infile):
            c.count("Line")
            cleaned_line = remove_numbering(line)
            cleaned_line = remove_special_chars(cleaned_line)
            # Record every distinct character that survived cleaning.
            all_chars.update(cleaned_line)
            outfile.write("%s\n" % cleaned_line)
            outnumfile.write("%s\t%s\n" % (linenum + 1, cleaned_line))
        c.summarise()
    print("-" * 80)
    sorted_chars = sorted(all_chars)
    try:
        print("All characters: %s" % str(sorted_chars))
    except UnicodeEncodeError:
        # The console encoding cannot represent some of the characters. Fall
        # back to an ASCII-escaped listing rather than silently dropping the
        # report (and rather than hiding unrelated errors with a bare except).
        print("All characters: %s" % ascii(sorted_chars))
    print("Done!")
コード例 #3
0
def dev(client, page):
    """Development helper: probe the cache check, then back up favourites.

    Prints the result of ``is_cached('test.jpg', ...)`` for the user's
    Pictures directory, then stores every image returned by
    ``client.backup_myfavs(page)`` whose link is not yet in the ``image``
    table, and commits.

    Args:
        client: Object providing ``backup_myfavs(page)``; also passed to
            ``get_db_file`` to locate the database file.
        page: Forwarded unchanged to ``client.backup_myfavs``.
    """
    picture_dirs = [FileTool.abspath('~/Pictures/')]
    print(is_cached('test.jpg', picture_dirs))

    # Back up to the database.
    database_file = get_db_file(client)
    with SchemaImgur(database_file) as schema:
        # A missing or zero-byte file has no tables yet.
        has_content = (os.path.isfile(database_file)
                       and os.path.getsize(database_file) != 0)
        if not has_content:
            schema.create()
        favourites = client.backup_myfavs(page)
        for favourite in favourites:
            existing_row = schema.image.select_single(
                where='link = ?', values=[favourite.link])
            if not existing_row:
                print("Saving: %s" % favourite.link)
                schema.image.insert([
                    favourite.title,
                    favourite.description,
                    favourite.datetime,
                    favourite.link,
                ])
                continue
            # Already stored: report and leave the existing row untouched.
            print(
                "This link is ignored because it exists in current database: %s"
                % favourite.link)
        schema.ds().commit()
コード例 #4
0
ファイル: imgur.py プロジェクト: dakside/imgurtk
def dev(client, page):
    """Development helper: probe the cache check, then back up favourites.

    Prints the result of ``is_cached('test.jpg', ...)`` for the user's
    Pictures directory, then stores every image returned by
    ``client.backup_myfavs(page)`` whose link is not yet in the ``image``
    table, and commits.

    Args:
        client: Object providing ``backup_myfavs(page)``; also passed to
            ``get_db_file`` to locate the database file.
        page: Forwarded unchanged to ``client.backup_myfavs``.
    """
    picture_dirs = [FileTool.abspath('~/Pictures/')]
    print(is_cached('test.jpg', picture_dirs))

    # Back up to the database.
    database_file = get_db_file(client)
    with SchemaImgur(database_file) as schema:
        # A missing or zero-byte file has no tables yet.
        has_content = (os.path.isfile(database_file)
                       and os.path.getsize(database_file) != 0)
        if not has_content:
            schema.create()
        favourites = client.backup_myfavs(page)
        for favourite in favourites:
            existing_row = schema.image.select_single(
                where='link = ?', values=[favourite.link])
            if not existing_row:
                print("Saving: %s" % favourite.link)
                schema.image.insert([
                    favourite.title,
                    favourite.description,
                    favourite.datetime,
                    favourite.link,
                ])
                continue
            # Already stored: report and leave the existing row untouched.
            print("This link is ignored because it exists in current database: %s" % favourite.link)
        schema.ds().commit()
コード例 #5
0
    note here is that this script will be used in the other examples so
    set up a test user with API credentials and set them up in auth.ini.
'''

import os
import argparse
import sys
from igui import AuthForm
from auth import SimpleImgurClient

from chirptext.leutile import FileTool
from puchikarui import *

#------------------------------------------------------------------------------

# Location of 'dirs.txt' (relative to the current directory) as resolved by
# FileTool.abspath.
# NOTE(review): the name says DIR but the value is a file path -- presumably
# the file that lists the directories to search; confirm against its users.
STORE_DIR = FileTool.abspath('./dirs.txt')

#------------------------------------------------------------------------------


class SchemaImgur(Schema):
    """Schema with a single ``image`` table: title, description, datetime, link."""

    def __init__(self, data_source=None):
        """Initialise the base ``Schema`` and register the ``image`` table.

        Args:
            data_source: Forwarded unchanged to ``Schema.__init__``.
        """
        Schema.__init__(self, data_source)
        image_columns = ['title', 'description', 'datetime', 'link']
        self.add_table('image', image_columns)

    def create(self):
        """Create the ``image`` table if it does not exist yet.

        ``link`` is declared as the primary key; the other columns are
        untyped.
        """
        create_script = '''
        -- DROP TABLE IF EXISTS image; 
        CREATE TABLE IF NOT EXISTS image(title, description, datetime, link PRIMARY KEY);
        '''
        self.ds().executescript(create_script)
コード例 #6
0
ファイル: imgur.py プロジェクト: dakside/imgurtk
    note here is that this script will be used in the other examples so
    set up a test user with API credentials and set them up in auth.ini.
'''

import os
import argparse
import sys
from igui import AuthForm
from auth import SimpleImgurClient

from chirptext.leutile import FileTool
from puchikarui import *

#------------------------------------------------------------------------------

# Location of 'dirs.txt' (relative to the current directory) as resolved by
# FileTool.abspath.
# NOTE(review): the name says DIR but the value is a file path -- presumably
# the file that lists the directories to search; confirm against its users.
STORE_DIR = FileTool.abspath('./dirs.txt')

#------------------------------------------------------------------------------

class SchemaImgur(Schema):
    """Schema with a single ``image`` table: title, description, datetime, link."""

    def __init__(self, data_source=None):
        """Initialise the base ``Schema`` and register the ``image`` table.

        Args:
            data_source: Forwarded unchanged to ``Schema.__init__``.
        """
        Schema.__init__(self, data_source)
        image_columns = ['title', 'description', 'datetime', 'link']
        self.add_table('image', image_columns)

    def create(self):
        """Create the ``image`` table if it does not exist yet.

        ``link`` is declared as the primary key; the other columns are
        untyped.
        """
        create_script = '''
        -- DROP TABLE IF EXISTS image; 
        CREATE TABLE IF NOT EXISTS image(title, description, datetime, link PRIMARY KEY);
        '''
        self.ds().executescript(create_script)

def is_cached(filename, dirs):