Esempio n. 1
0
def import_data_from_text(data_filename, field_type_filename, table):
    """Insert every row of a delimited text file into an SQLite table.

    The field-type file is read with ``make_dictionary_from_file`` (column 0
    as key, column 1 as value, no header) and is used as a mapping from field
    name to one of ``"INTEGER"``, ``"FLOAT"`` or ``"TEXT"``.  The data file is
    read with ``file_to_list``, which yields the rows and the field names.

    Values are bound with ``?`` placeholders instead of being spliced into
    the SQL text, so quotes or other SQL syntax inside a value cannot break
    or alter the statement.

    Args:
        data_filename: Path of the data file to import.
        field_type_filename: Path of the file describing each field's type.
        table: Name of the destination table.  It is interpolated into the
            statement (identifiers cannot be bound), so it must come from a
            trusted source.

    Returns:
        None.  If a row has a different number of columns than the header,
        or a field has an unrecognised type, a message is printed and the
        function returns without committing; inserts made so far are
        discarded when the connection is closed.

    Raises:
        ValueError: If an INTEGER/FLOAT value is neither ``'None'`` nor a
            number.
        KeyError: If a field name has no entry in the field-type mapping.
    """
    field_types = make_dictionary_from_file(field_type_filename, key_col=0, val_col=1, has_header=False)
    data, field_names = file_to_list(data_filename)

    # The column list is the same for every row, so build the statement once.
    # Only table/column names are interpolated; all values are bound.
    sql = 'insert into %s (%s) values (%s)' % (
        table,
        ','.join(field_names),
        ','.join(['?'] * len(field_names)),
    )

    # db_path is a module-level name defined outside this function.
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        for i, row in enumerate(data):
            if len(row) != len(field_names):
                print('missing columns row:%d' % (i))
                return

            values = []
            for name, raw in zip(field_names, row):
                field_type = field_types[name]
                if field_type in ("INTEGER", "FLOAT"):
                    # The literal text 'None' marks a missing number; store 0.
                    values.append(0 if raw == 'None' else _parse_number(raw))
                elif field_type == "TEXT":
                    # Double quotes were historically replaced by spaces to
                    # survive string-built SQL; kept so re-imports store the
                    # same text as earlier imports did.
                    values.append(raw.replace('"', ' '))
                else:
                    print('no valid type row:%d' % (i))
                    return

            cursor.execute(sql, values)
            print(i + 1)  # progress: 1-based count of rows inserted so far

        conn.commit()
        cursor.close()
    finally:
        # Always release the database handle, including on early return or
        # when an insert raises.
        conn.close()

    return


def _parse_number(text):
    """Convert a numeric string to int when possible, otherwise to float."""
    try:
        return int(text)
    except ValueError:
        return float(text)
Esempio n. 2
0
def n_gram_chapters(chapter_index_filename, verse_location, n_gram_path):
    """Write bigram, word-index and adjacency-matrix files for verse files.

    The chapter index is read with ``make_dictionary_from_file`` (tab
    separated, column 0 as key).  For every key, each file in
    ``verse_location`` whose name starts with that key is processed and three
    files are written, each named ``n_gram_path + <name> + <suffix> + <ext>``
    where ``<name>`` and ``<ext>`` come from ``file_name_ext``:

    * ``_n-gram.``    - JSON mapping of each word to the words that follow it.
    * ``_dic_index.`` - one ``word<TAB>index`` line per distinct word.
    * ``_chord.``     - comma-separated square matrix; entry (row, col) is
      set when the word with index ``col`` follows the word with index
      ``row``.

    A verse file that contains no words no longer raises ``IndexError``; its
    mapping and word list are simply empty.

    Args:
        chapter_index_filename: Path of the tab-separated chapter index.
        verse_location: Directory containing the verse files.
        n_gram_path: Prefix prepended verbatim to every output file name
            (include a trailing separator if it is a directory).
    """
    chapters = make_dictionary_from_file(chapter_index_filename, sep='\t', key_col=0)
    files = os.listdir(verse_location)

    for chapter_key in chapters:
        for verse_file in fnmatch.filter(files, chapter_key + '*'):
            verse_text = get_file_text(os.path.join(verse_location, verse_file))
            # TODO: Make this a function, punctuation replacement
            for mark in ('.', ',', '\n'):
                verse_text = verse_text.replace(mark, ' ')
            verse_words = verse_text.split()

            # Map each word to the set of words seen directly after it.
            # NOTE(review): every follower is stored with the value 1, as in
            # the original `get(word, 1)` logic, so this records presence and
            # not frequency.  If counts were intended this should become
            # `followers[w] = followers.get(w, 0) + 1` - confirm before
            # changing, since it alters the JSON and matrix outputs.
            n_gram = {}
            for prev_word, next_word in zip(verse_words, verse_words[1:]):
                n_gram.setdefault(prev_word, {}).setdefault(next_word, 1)

            _, file_name, extension = file_name_ext(verse_file)

            output_filename = n_gram_path + file_name + '_n-gram.' + extension
            with open(output_filename, 'w') as o_file:
                json.dump(n_gram, o_file, sort_keys=True, indent=4)

            # Distinct words in dict iteration order (first occurrence on
            # Python 3.7+); a word's position is its matrix row/column index.
            verse_index = list(dict.fromkeys(verse_words))
            index_lookup = {word: idx for idx, word in enumerate(verse_index)}

            output_filename = n_gram_path + file_name + '_dic_index.' + extension
            with open(output_filename, 'w') as o_file:
                for idx, word in enumerate(verse_index):
                    o_file.write('%s\t%d\n' % (word, idx))

            dic_size = len(verse_index)
            # Builtin int replaces the np.int alias, which newer NumPy
            # releases no longer provide.
            verse_matrix = np.zeros((dic_size, dic_size), dtype=int)
            for prev_word, followers in n_gram.items():
                row_index = index_lookup[prev_word]
                for next_word, value in followers.items():
                    verse_matrix[row_index, index_lookup[next_word]] = value

            output_filename = n_gram_path + file_name + '_chord.' + extension
            np.savetxt(output_filename, verse_matrix, delimiter=',', fmt='%d')