Example #1
import csv
from itertools import izip

# Table, utffile and create_keys come from the surrounding project.
def process_parallel(p_conf, keys, univ_conf, connection):
    fs = {}
    csvr = {}
    generators = []
    used_keys = set()
    table_classes = {}
    cursor = connection.cursor()

    def tag_rows(filename, reader):
        # Bind filename here; a bare generator expression would look the loop
        # variable up lazily and tag every row with the last file's name.
        for row in reader:
            yield (filename, row)

    try:
        for table, table_conf in p_conf['tables'].iteritems():
            table_class = Table(table_conf, univ_conf, cursor)
            table_classes[table] = table_class
            # Collect every key column used by any table in this batch.
            used_keys.update(v for (k, v) in table_class.key_columns)
            filename = table_conf['filename']
            # Open each input file exactly once, even when tables share it.
            if filename not in fs:
                fs[filename] = utffile(filename, 'rU') if univ_conf['use_utf'] else open(filename, 'rU')
            if filename not in csvr:
                csvr[filename] = csv.reader(fs[filename], quotechar=table_class.quote_char, delimiter=table_class.field_sep)
                # Skip any configured header lines before handing the reader out.
                if 'skip_head_lines' in table_conf:
                    for i in range(int(table_conf['skip_head_lines'])):
                        csvr[filename].next()
                generators.append(tag_rows(filename, csvr[filename]))
        # izip yields one row per file per step, keeping the files in lockstep.
        for lines in izip(*generators):
            lines = dict(lines)
            key_values = create_keys(used_keys, keys, univ_conf['key_sources'])
            for table, table_conf in p_conf['tables'].iteritems():
                table_classes[table].writer.send((lines[table_conf['filename']], key_values))
        for table in p_conf['tables']:
            table_classes[table].writer.close()
    finally:
        for f in fs.values():
            f.close()
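For context, a minimal sketch of how process_parallel might be invoked. The config layout, table names, and filenames here are assumptions for illustration only; the exact fields a table config requires are defined by the Table class, which is not shown in these examples.

import psycopg2

# Hypothetical configs; real table configs carry whatever extra fields Table
# expects (key columns, quote_char, field_sep, and so on).
univ_conf = {'use_utf': False, 'key_sources': {}}
p_conf = {
    'tables': {
        'people':    {'filename': 'people.csv'},
        'addresses': {'filename': 'addresses.csv'},
    },
}
connection = psycopg2.connect('dbname=example')
process_parallel(p_conf, {}, univ_conf, connection)
connection.commit()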
Example #2
import csv

# Table and utffile come from the surrounding project.
def process_table(table_conf, univ_conf, connection):
    cursor = connection.cursor()
    table_class = Table(table_conf, univ_conf, cursor)
    with utffile(table_conf['filename'], 'rU') if univ_conf['use_utf'] else open(table_conf['filename'], 'rU') as f:
        csvr = csv.reader(f, quotechar=table_class.quote_char, delimiter=table_class.field_sep)
        # Skip any configured header lines before loading rows.
        if 'skip_head_lines' in table_conf:
            for i in range(int(table_conf['skip_head_lines'])):
                csvr.next()
        for l in csvr:
            table_class.writer.send((l, None))
        table_class.writer.close()
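Both processors push rows into a Table's writer with send() and finish with close(), the standard Python 2 coroutine protocol. The real _sql_writer is not shown here; the following is only a sketch of a compatible consumer, assuming it buffers rows and flushes them to SQL.

def sql_writer_sketch():
    # Receives (row, key_values) tuples via send(); close() raises
    # GeneratorExit at the yield, giving a place to flush buffered rows.
    buffered = []
    try:
        while True:
            row, key_values = (yield)
            buffered.append(row)
            if len(buffered) >= 1000:
                print 'flushing %d rows' % len(buffered)  # stand-in for an INSERT
                buffered = []
    except GeneratorExit:
        if buffered:
            print 'final flush of %d rows' % len(buffered)

writer = sql_writer_sketch()
writer.next()  # prime the coroutine so it pauses at its first yield
writer.send((['a', 'b'], None))
writer.close()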
Example #3
import csv
from itertools import izip

# Table, utffile and _create_keys come from the surrounding project.
def process_parallel(p_conf, keys, univ_conf, connection):
    """Process tables in parallel from a selection of files.

    takes:
        p_conf - a dict with a 'tables' entry mapping the names of the tables
            to process in parallel to their table configs
        keys - a dict of key names to key sources
        univ_conf - the universal config
        connection - a psycopg connection object

    Iterates through the table configs, creating Table objects, collecting
    the keys used by the tables, and opening the files the tables need. Each
    file is associated with a csv reader (TODO: figure out how to choose
    DictReader based on the needs of all tables associated with the file),
    which is then wrapped in a generator that yields (filename, row) tuples.
    Some quick izip usage provides iteration through tuples of the form
    ((filename1, row1), (filename2, row2), ...), which can be converted to a
    dict so that each table can access its file's current row without the
    files having to be read sequentially. This could potentially be used to
    load different columns of a table from different files in parallel. Each
    Table grabs the row it needs and sends it to its _sql_writer coroutine.
    """
    fs = {}
    csvr = {}
    generators = []
    used_keys = set()
    table_classes = {}
    cursor = connection.cursor()

    def tag_rows(filename, reader):
        # Bind filename here; a bare generator expression would look the loop
        # variable up lazily and tag every row with the last file's name.
        for row in reader:
            yield (filename, row)

    try:
        for table, table_conf in p_conf['tables'].iteritems():
            table_class = Table(table_conf, univ_conf, cursor)
            table_classes[table] = table_class
            used_keys.update(v for (k, v) in table_class.key_columns)
            filename = table_conf['filename']
            # Open each input file exactly once, even when tables share it.
            if filename not in fs:
                fs[filename] = utffile(filename, 'rU') if univ_conf['use_utf'] else open(filename, 'rU')
            if filename not in csvr:
                # An absent dict_reader key defaults to False.
                if table_conf.get('dict_reader'):
                    csvr[filename] = csv.DictReader(fs[filename], quotechar=table_class.quote_char, delimiter=table_class.field_sep)
                else:
                    csvr[filename] = csv.reader(fs[filename], quotechar=table_class.quote_char, delimiter=table_class.field_sep)
                    # Skip any configured header lines before handing the reader out.
                    if 'skip_head_lines' in table_conf:
                        for i in range(int(table_conf['skip_head_lines'])):
                            csvr[filename].next()
                generators.append(tag_rows(filename, csvr[filename]))
            elif isinstance(csvr[filename], csv.DictReader) != bool(table_conf.get('dict_reader')):
                raise Exception(
                    'Improper Configuration: {table} is configured to {not1} use '
                    'csv.DictReader but another table loaded from the same file '
                    'is configured to {not2} use csv.DictReader. (The default is '
                    'to not use it, so it must be specified on every table config '
                    'for that file.)'.format(
                        table=table_conf['table'],
                        not1=('' if table_conf.get('dict_reader') else 'not'),
                        not2=('not' if table_conf.get('dict_reader') else '')))

        # izip yields one row per file per step, keeping the files in lockstep.
        for lines in izip(*generators):
            lines = dict(lines)
            key_values = _create_keys(used_keys, keys, univ_conf['key_sources'])
            for table, table_conf in p_conf['tables'].iteritems():
                table_classes[table].writer.send((lines[table_conf['filename']], key_values))
        for table in p_conf['tables']:
            table_classes[table].writer.close()
    finally:
        for f in fs.values():
            f.close()
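The izip trick described in the docstring is easiest to see in isolation. A self-contained sketch, with plain lists standing in for the csv readers and the same tag_rows helper as above:

from itertools import izip

def tag_rows(filename, reader):
    for row in reader:
        yield (filename, row)

readers = {
    'people.csv':    [['alice', '1'], ['bob', '2']],
    'addresses.csv': [['12 Foo St'], ['34 Bar Ave']],
}
generators = [tag_rows(name, rows) for name, rows in readers.iteritems()]

# Each step of izip yields one (filename, row) pair per file, so converting
# to a dict gives every table random access to its file's current row.
for lines in izip(*generators):
    lines = dict(lines)
    print lines['people.csv'], lines['addresses.csv']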
Example #4
import csv

# Table and utffile come from the surrounding project.
def process_table(table_conf, univ_conf, connection):
    """The simple, non-parallel table processor.

    takes:
        table_conf - a table config dict
        univ_conf - the universal config dict
        connection - a psycopg connection object

    Creates a Table from table_conf, opens the file associated with this
    table, creates a csv reader for it, and iterates through the file,
    sending the rows to the Table's _sql_writer coroutine.
    """
    cursor = connection.cursor()
    table_class = Table(table_conf, univ_conf, cursor)
    with utffile(table_conf['filename'], 'rU') if univ_conf['use_utf'] else open(table_conf['filename'], 'rU') as f:
        # An absent dict_reader key defaults to False.
        if table_conf.get('dict_reader'):
            csvr = csv.DictReader(f, quotechar=table_class.quote_char, delimiter=table_class.field_sep)
        else:
            csvr = csv.reader(f, quotechar=table_class.quote_char, delimiter=table_class.field_sep)
            # Skip any configured header lines before loading rows.
            if 'skip_head_lines' in table_conf:
                for i in range(int(table_conf['skip_head_lines'])):
                    csvr.next()
        for l in csvr:
            table_class.writer.send((l, None))
        table_class.writer.close()
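utffile itself is not shown in these examples (the script below imports it from a project module). Assuming its job is simply to open a file so reads come back decoded from UTF-8, a minimal stand-in could look like this; the real implementation likely does more, since Python 2's csv module is byte-oriented and usually needs the re-encoding dance from the csv documentation's UTF8Recoder recipe.

import codecs

def utffile_sketch(filename, mode='r'):
    # A guess at utffile's role: yield unicode lines decoded from UTF-8.
    return codecs.open(filename, mode, encoding='utf-8')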
"""
data_train = fetch_20newsgroups(subset='train', categories=categories,
                               shuffle=True, random_state=42)

data_test = fetch_20newsgroups(subset='test', categories=categories,
                              shuffle=True, random_state=42)
"""
print 'data loaded'
import conversions as conv
from utffile import utffile
special_terms = []
vocabulary = []
basic_vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, use_idf=False,
                                   stop_words='english')
basic_analyze = basic_vectorizer.build_analyzer()
with utffile('searchterms.csv') as f:
    for s in f:
        if s.startswith('<'):
            # Strip the newline first; otherwise the trailing '>' survives strip('<>').
            special_terms.append(s.strip().strip('<>'))
        else:
            vocabulary.append(s.decode('utf-8').strip())
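The loaded vocabulary and the analyze function defined below are presumably meant to feed a second vectorizer; both a fixed vocabulary and a callable analyzer are standard TfidfVectorizer parameters. A hedged sketch of that wiring, not taken from this script:

# Sketch only: restrict a vectorizer to the search-term vocabulary loaded
# above. The analyze function below could likewise be plugged in with
# TfidfVectorizer(analyzer=analyze), since analyzer accepts a callable.
term_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=False,
                                  vocabulary=vocabulary)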

def analyze(s):
    # NOTE: eval assumes each row is trusted input; ast.literal_eval would be
    # a safer choice if every row is a literal dict.
    d = eval(s)
    special_keys = []
    name = d['name']
    electoral_district_type = d['electoral_district_type']
    electoral_district_name = d['electoral_district_name']
    state = d['state']
    link = d['link']
    text = d['sitetext'].decode('utf-8').lower()  # decode first so non-ASCII characters lowercase correctly