Beispiel #1
0
def clean_corpus(inputfile):
    """Clean a raw text corpus and write plain and line-numbered copies.

    Each line of ``inputfile`` is passed through ``remove_numbering`` and then
    ``remove_special_chars``. The cleaned text is written to two files inside
    ``OUT_DIR``: ``<name>.cleaned.txt`` (one cleaned line per row) and
    ``<name>.num.txt`` (1-based line number, a tab, then the cleaned line),
    where ``<name>`` is ``FileTool.getfilename(inputfile)``. The processed
    lines are tallied via the counter and the set of characters found in the
    cleaned text is printed at the end.

    Args:
        inputfile: Path of the UTF-8 encoded text file to clean.
    """
    print("Script for cleaning raw text input")
    c = Counter()
    all_chars = set()

    base_name = FileTool.getfilename(inputfile)
    output_file = os.path.join(OUT_DIR, base_name + '.cleaned.txt')
    output_numfile = os.path.join(OUT_DIR, base_name + '.num.txt')
    print("Input file            : %s" % (inputfile))
    print("Output file           : %s" % (output_file))
    print("Output (numbered) file: %s" % (output_numfile))

    with open(inputfile, 'r', encoding='utf8') as infile, \
            open(output_file, 'w', encoding='utf8') as outfile, \
            open(output_numfile, 'w', encoding='utf8') as outnumfile:
        for linenum, line in enumerate(infile, start=1):
            c.count("Line")
            cleaned_line = remove_special_chars(remove_numbering(line))
            all_chars.update(cleaned_line)
            outfile.write("%s\n" % cleaned_line)
            outnumfile.write("%s\t%s\n" % (linenum, cleaned_line))
        c.summarise()
    print("-" * 80)
    chars = sorted(all_chars)
    try:
        print("All characters: %s" % str(chars))
    except UnicodeEncodeError:
        # The console encoding cannot represent some of the characters.
        # Report them ASCII-escaped instead of silently dropping the summary
        # (the previous bare ``except: pass`` also hid unrelated errors).
        print("All characters: %s" % ascii(chars))
    print("Done!")
Beispiel #2
0
def clean_corpus(inputfile):
    """Strip numbering and special characters from every line of a text file.

    Reads ``inputfile`` as UTF-8, applies ``remove_numbering`` followed by
    ``remove_special_chars`` to each line, and writes the result twice under
    ``OUT_DIR``: once as ``<name>.cleaned.txt`` and once as ``<name>.num.txt``
    with each row prefixed by its 1-based line number and a tab. ``<name>`` is
    whatever ``FileTool.getfilename(inputfile)`` returns. A line tally and the
    sorted set of characters seen in the cleaned output are printed to stdout.

    Args:
        inputfile: Path of the raw text file to process.
    """
    print("Script for cleaning raw text input")
    c = Counter()
    all_chars = set()

    # Resolve the shared file-name stem once; both outputs are derived from it.
    name_stem = FileTool.getfilename(inputfile)
    output_file = os.path.join(OUT_DIR, name_stem + '.cleaned.txt')
    output_numfile = os.path.join(OUT_DIR, name_stem + '.num.txt')
    print("Input file            : %s" % (inputfile))
    print("Output file           : %s" % (output_file))
    print("Output (numbered) file: %s" % (output_numfile))

    with open(inputfile, 'r', encoding='utf8') as infile, \
            open(output_file, 'w', encoding='utf8') as outfile, \
            open(output_numfile, 'w', encoding='utf8') as outnumfile:
        for linenum, line in enumerate(infile, start=1):
            c.count("Line")
            cleaned_line = remove_numbering(line)
            cleaned_line = remove_special_chars(cleaned_line)
            all_chars.update(cleaned_line)
            outfile.write("%s\n" % cleaned_line)
            outnumfile.write("%s\t%s\n" % (linenum, cleaned_line))
        c.summarise()
    print("-" * 80)
    seen_chars = sorted(all_chars)
    try:
        print("All characters: %s" % str(seen_chars))
    except UnicodeEncodeError:
        # Only an un-encodable console is expected to fail here; fall back to
        # an ASCII-escaped listing rather than swallowing every exception.
        print("All characters: %s" % ascii(seen_chars))
    print("Done!")
Beispiel #3
0
def dev(client, page):
    """Development helper: probe the picture cache, then back up favourites.

    Prints the result of ``is_cached('test.jpg', ...)`` for ``~/Pictures/``,
    then writes every image returned by ``client.backup_myfavs(page)`` into
    the database at ``get_db_file(client)``. Images whose link is already
    stored are reported and skipped. The table is created first when the
    database file is missing or empty.
    """
    picture_dirs = [FileTool.abspath('~/Pictures/')]
    print(is_cached('test.jpg', picture_dirs))

    # backup to DB
    database_file = get_db_file(client)
    with SchemaImgur(database_file) as schema:
        # Short-circuit keeps getsize() from running on a missing file.
        if not (os.path.isfile(database_file)
                and os.path.getsize(database_file) != 0):
            schema.create()
        for favourite in client.backup_myfavs(page):
            existing = schema.image.select_single(where='link = ?',
                                                  values=[favourite.link])
            if not existing:
                print("Saving: %s" % (favourite.link))
                schema.image.insert([
                    favourite.title,
                    favourite.description,
                    favourite.datetime,
                    favourite.link,
                ])
                continue
            # update?
            print(
                "This link is ignored because it exists in current database: %s"
                % (favourite.link))
        schema.ds().commit()
Beispiel #4
0
def dev(client, page):
    """Try out the cache check and store one page of favourites in the DB.

    First prints whether ``test.jpg`` is reported as cached for the
    ``~/Pictures/`` directory. Then opens the database returned by
    ``get_db_file(client)``, creates the table if the file is absent or
    zero-sized, and inserts each image from ``client.backup_myfavs(page)``
    unless a row with the same link already exists. Commits once at the end.
    """
    search_dirs = [FileTool.abspath('~/Pictures/')]
    print(is_cached('test.jpg', search_dirs))

    # backup to DB
    path = get_db_file(client)
    with SchemaImgur(path) as store:
        needs_schema = not os.path.isfile(path) or os.path.getsize(path) == 0
        if needs_schema:
            store.create()
        for item in client.backup_myfavs(page):
            found = store.image.select_single(where='link = ?', values=[item.link])
            if found:
                # update?
                print("This link is ignored because it exists in current database: %s" % (item.link))
                continue
            print("Saving: %s" % (item.link))
            row = [item.title, item.description, item.datetime, item.link]
            store.image.insert(row)
        store.ds().commit()
Beispiel #5
0
    note here is that this script will be used in the other examples so
    set up a test user with API credentials and set them up in auth.ini.
'''

import os
import argparse
import sys
from igui import AuthForm
from auth import SimpleImgurClient

from chirptext.leutile import FileTool
from puchikarui import *

#------------------------------------------------------------------------------

# Path of the ``dirs.txt`` file, resolved once at import via FileTool.abspath.
# NOTE(review): despite the _DIR suffix this names a file, not a directory;
# presumably it holds a list of directories -- confirm against its users.
STORE_DIR = FileTool.abspath('./dirs.txt')

#------------------------------------------------------------------------------


class SchemaImgur(Schema):
    """Schema exposing a single ``image`` table (title, description, datetime, link)."""

    def __init__(self, data_source=None):
        """Initialise the base schema and register the ``image`` table."""
        super().__init__(data_source)
        image_columns = ['title', 'description', 'datetime', 'link']
        self.add_table('image', image_columns)

    def create(self):
        """Run the script that creates the ``image`` table when it is absent."""
        ddl_script = '''
        -- DROP TABLE IF EXISTS image; 
        CREATE TABLE IF NOT EXISTS image(title, description, datetime, link PRIMARY KEY);
        '''
        self.ds().executescript(ddl_script)
Beispiel #6
0
    note here is that this script will be used in the other examples so
    set up a test user with API credentials and set them up in auth.ini.
'''

import os
import argparse
import sys
from igui import AuthForm
from auth import SimpleImgurClient

from chirptext.leutile import FileTool
from puchikarui import *

#------------------------------------------------------------------------------

# Path of the ``dirs.txt`` file, computed at import time with FileTool.abspath.
# NOTE(review): the name says "DIR" but the value points at a text file;
# presumably the file lists directories -- verify where STORE_DIR is read.
STORE_DIR = FileTool.abspath('./dirs.txt')

#------------------------------------------------------------------------------

class SchemaImgur(Schema):
    """Database schema with one ``image`` table keyed by ``link``."""

    def __init__(self, data_source=None):
        """Set up the parent schema, then declare the ``image`` table columns."""
        super().__init__(data_source)
        columns = ['title', 'description', 'datetime', 'link']
        self.add_table('image', columns)

    def create(self):
        """Execute the CREATE TABLE IF NOT EXISTS script on the data source."""
        script = '''
        -- DROP TABLE IF EXISTS image; 
        CREATE TABLE IF NOT EXISTS image(title, description, datetime, link PRIMARY KEY);
        '''
        self.ds().executescript(script)

def is_cached(filename, dirs):