Example #1
0
def main():
    """Worker loop: claim unprocessed 'contract' files from files_list one at
    a time (guarded by a PostgreSQL advisory lock), parse each via
    parse_file_id(), and mark the row inserted.

    NOTE(review): Python 2 code. Relies on module-level DBConnector,
    parse_file_id, datetime, sys, os and traceback being in scope.
    """
    # DBConnector / getCursor / getConn come from the project's helper module.
    db_connector = DBConnector('default')   
    curr=db_connector.getCursor()
    conn=db_connector.getConn()
    # Claim one not-yet-inserted, unlocked 'contract' file; pg_try_advisory_lock
    # keeps concurrent workers from selecting the same row.
    curr.execute("SELECT id,path FROM files_list where inserted=false and locked=false and pg_try_advisory_lock(tableoid::INTEGER,id) and path like '%contract%' limit 1")
    # NOTE(review): this unpacking raises ValueError when the query returns no
    # rows, so the "no more to insert" message below looks unreachable — confirm.
    [(id,path)] = curr.fetchall()

    results=[]  # NOTE(review): never appended to; printed empty at the end.
    while id > 0:  
        try:
            dt = datetime.datetime.now()
            # Also mark the row locked so other workers skip it even without
            # holding the advisory lock.
            curr.execute("update files_list set locked=true ,lock_time=now() where id=%s",(id,))
            conn.commit()
            print "--Try get file" , id , path
            print "--RAISE warning 'Start to read file:%s:%s';" % (id,path)
            dt = datetime.datetime.now()
            parse_file_id(id,path)       
        except Exception as e:
            # Parsing failed: roll back and dump diagnostics. Control still
            # falls through to finally, so the row is marked inserted anyway.
            print "--Poblem!",e
            conn.rollback()
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)
            print(exc_type, fname, exc_tb.tb_lineno)
            traceback.print_exc()
        finally:
            # Flag the row done, release the advisory lock, then claim the
            # next candidate — this re-query is what drives the loop forward.
            curr.execute("update files_list set locked=false,inserted=true,insert_time=now() where id=%s; select pg_advisory_unlock(tableoid::INTEGER,id) from files_list where id = %s",(id,id))
            conn.commit()
            curr.execute("SELECT id,path FROM files_list where inserted=false and locked=false and pg_try_advisory_lock(tableoid::INTEGER,id) and path like '%contract%' limit 1")
            [(id,path)] = curr.fetchall()
    print "Seems, there is no more to insert."
    print results
Example #2
0
class ZakupkiXMLParser(object):
    """Turns zakupki.gov.ru XML documents into relational tables.

    handle_element() walks an lxml tree, recording table structure in
    self.tables and cell data in self.values; the generate_* methods then
    emit CREATE TABLE and INSERT statements to stdout and/or PostgreSQL.

    NOTE(review): Python 2 code; depends on module-level names `schema`,
    `pickle`, `psycopg2`, `re`, `sys`, `datetime` and the project's
    DBConnector helper.
    """

    def __init__(self):
        # Discovered structure: table name -> list of column names.
        self.tables=dict()
        # Collected data: "<table>_<column>" -> list of lists of text values.
        self.values=dict()
        # Per-table uid counters, persisted across runs via uids.pkl.
        self.uids=dict()
        self.sql_inserts=[]
        # Restore uid counters from a previous run; any failure (missing or
        # corrupt pickle) silently starts counters fresh.
        try: 
            uids_file = open('uids.pkl', 'rb')
            self.uids = pickle.load(uids_file)
            uids_file.close()
        except:
            pass
        '''
        if CREATE_TABLES==True:
            try:
                tables_file = open('tables.pkl','rb')
                tables=pickle.load(tables_file)
                tables_file.close()
            except:
                pass
        '''
        # `schema` is assumed to exist at module level — TODO confirm where set.
        self.SCHEMA=schema
        self.PK=['uid']
        self.FK_IS=['uid']
        self.FK=['parent_uid']
        # Column-name -> SQL type classification consumed by generate_types().
        self.INTEGER=['month','year','id','versionNumber','kpp','countryCode']
        self.BIGINTEGER=['inn','regNum','uid','parent_uid']
        self.FLOAT=[]
        self.NUMERIC=['price','sum']#'quantity'] Fond that in contracts it can be text
        self.VARCHAR=['']
        self.UNQUOTED=['parent_uid']
        # If need modify column names in output or modify tablenames in output

        # Renames for columns that collide with SQL keywords.
        self.COLUMN_MAPPING={
        'placing':'pplacing',
        'order':'"order"',
        'from':'"from"',
        }
        # Regex -> replacement pairs used to shorten generated table names
        # (PostgreSQL identifiers are capped at 63 chars). Keys are regex
        # patterns, applied via re.sub in remapTable().
        self.TABLES_MAPPING={
        'protocol[A-Z]{2,}[0-9]|prot[A-Z]{2,}[0-9]':'prtk',
        'protocol':'prot',
        #'_protocolLot':'_iprotLot',
        'application':'app',
        'admission':'adm',
        'Result':'Res',
        'Commission':'Com',
        'Member':'Mem',
        'Rejected':'Rej',
        'Reason':'Reas',
        'Participant':'Part',
        'Condition':'Cond',
        'evaluation':'ev',
        'requirement':'req',
        'Requirement':'Req',
        'Compliance':'Comp',
        'attributes':'attr',
        'contract':'contr',
        'criterion':'crit',
        #'protocol':'prot',
        'notification[A-Z]{2,}[0-9]?':'notf',
        'notification':'notif',
        'Feature':'Feat',
        '-':'',
        '\.':'',
        }



    def setDbConn(self,dbconn):
        '''Set DB connection'''
        # NOTE(review): the first assignment is immediately overwritten by the
        # live psycopg2 connection built from the same dsn string.
        self.dbConn=dbconn
        self.dbConn = psycopg2.connect(dbconn)
        self.curr=self.dbConn.cursor()
        

    def writeToConsole(self,choice=False):
        '''Toggle echoing of generated SQL to stdout (see output()).'''
        self.write_to_console=choice

    def writeToDb(self,choice=True):
        '''Toggle DB output; (re)opens the default project DB connection.'''
        self.db_connector = DBConnector('default')
        self.dbConn=self.db_connector.getConn()
        self.curr=self.db_connector.getCursor()
        self.write_to_db=choice

    def output(self,sql):
        '''Execute `sql` on the DB (logging failures into import_log) and/or
        echo it to stdout, according to the write_to_* flags.'''
        if self.write_to_db:
            try:
                self.curr.execute(sql)
                #self.dbConn+'commit;')
            except psycopg2.Error as e:
                # Failed statement: roll back, then record it for later review.
                self.dbConn.rollback()
                dt = datetime.datetime.now()
                self.curr.execute('insert into import_log (file_id,time,sql,error) values (%s,%s,%s,%s);' ,(self.file_id,dt,sql,e.pgerror))
            finally:
                self.dbConn.commit()
        if self.write_to_console:
            sys.stdout.write(sql)

    def uniq(self,seq):
        '''Get Uniq in seq'''
        # Order-preserving de-duplication; seen_add is hoisted for speed.
        seen = set()
        seen_add = seen.add
        return [ x for x in seq if x not in seen and not seen_add(x)]

    def remapColumns(self,columns):
        '''Return columns with COLUMN_MAPPING renames applied (others kept).'''
        result=[]
        for i in columns:
            if i in self.COLUMN_MAPPING.keys():
                result.append(self.COLUMN_MAPPING[i])
            else:
                result.append(i)
        return result

    def remapTable(self,table):
        '''Shorten `table` by applying every TABLES_MAPPING regex in turn.

        NOTE(review): dict key order (arbitrary in py2) decides substitution
        order, and over-long/bad names only build a warning that is currently
        discarded (the stderr write is commented out).
        '''
        warning=''
        result=''
        wrong_simbols=['-','.']
        for i in self.TABLES_MAPPING.keys():    
            table = re.sub(i, self.TABLES_MAPPING[i], table)
        result=table
        if len(result)>63:
            warning += '--Warning table name too long:' + table + ':'+table[0:63]+'\n'
        if any(s in result for s in wrong_simbols):
            warning += '--Warning wrong simbol in table name:' + result + '\n'
        if len(warning)>0:
            pass
            #sys.stderr.write('\033[93m'+warning+'\033[0m')
        if len(result)>0:
            return result
        else:
            return table



    def handle_element(self,parent,elem):
        '''Recursively walk `elem`: elements with children register a table
        keyed "<parent>_<name>"; leaf elements append their tag to the parent
        table's column list and their text to self.values.'''
        #print parent,elem
        if len(elem.xpath("./*"))>0:
            # Non-leaf: ensure table/value buckets exist for this subtree.
            key=parent+'_'+(elem.xpath("local-name()"))
            key=self.remapTable(key)
            if key not in self.tables.keys():
                self.tables[key]=[]
                #tables[parent+'_'+(elem.xpath("local-name()"))]=[]
            if key not in self.values.keys():
                self.values[key]=[]
                #values[parent+'_'+(elem.xpath("local-name()"))]=[]
            #next_element.append(elem)
        else:
            # Leaf: record column name and its text() node list.
            self.tables[parent]+=[elem.xpath("local-name()")]
            column_name=parent+'_'+elem.xpath("local-name()")
            if column_name not in self.values.keys():
                self.values[column_name]=[]
                self.values[column_name]+=[elem.xpath("text()")]
            else:
                self.values[column_name]+=[elem.xpath("text()")]
        for i in elem:
            el = self.remapTable(elem.xpath("local-name()"))
            par=parent+'_'+el
            #par=remapTable(par)
            self.handle_element(par,i)

    def generate_types(self,columns):
        '''Map each column name to "name sqltype" using the class type lists;
        anything unclassified falls back to text.'''
        result=[]
        for i in columns:
            if i in self.PK:
                result.append(i+' bigint primary key')
            elif i in self.INTEGER:
                result.append(i+' integer')
            elif i in self.BIGINTEGER:
                result.append(i+' bigint')
            elif i in self.VARCHAR:
                result.append(i+' varchar(2000)')
            elif i in self.FLOAT:
                result.append(i+' float')
            elif i in self.NUMERIC:
                result.append(i+' numeric')
            else:
                result.append(i+' text')
        return result

    def getParent(self,table):
        '''Return the parent table name: everything before the last "_"
        ("" for top-level names).'''
        table_list=table.split('_')
        # NOTE(review): result is assigned three times; only the final
        # "".join(parent) (== parent) survives.
        result=["_"]
        parent="_".join(table_list[0:-1])
        result+=parent
        result="".join(parent)
        return result

    def sortByHierarhy(self,tables_list):
        # NOTE(review): list.sort() sorts in place and returns None, so this
        # method always returns None — confirm callers don't use the result.
        return tables_list.sort()

    def generate_create_tables(self,tables):
        '''Emit CREATE TABLE for every discovered table, adding the uid PK and
        a parent_uid FK for nested tables. Writes to stdout only, and only
        when SCHEMA is non-empty. The `tables` argument is unused.'''
        for i in sorted(self.tables.keys()):
            columns=self.uniq(self.tables[i])
            columns.append(self.PK[0])

            columns=self.remapColumns(columns)
            columns=self.generate_types(columns)

            tab=self.remapTable(i)
            columns=",".join(columns)

            parent = self.getParent(i)
            parent = self.remapTable(parent)

            sql=''
            schema=''

            if len(self.SCHEMA)>0:
                schema=self.SCHEMA+'.'
                if len(parent)==0:
                    sql="create table "+schema + tab +" ("+columns+");"
                else:
                    sql="create table "+schema + tab +" ("+columns+", parent_uid bigint references "+parent+"("+self.FK_IS[0]+")"+" );\n"
                sys.stdout.write(sql)

    def extract_column_for_table(self,table,columns):
        '''From `columns` (full "<table>_<column>" keys) keep only direct
        columns of `table`: the suffix after `table` is "_<name>" with exactly
        one underscore, and the key is not itself a table.'''
        result=[]
        for i in columns:
            prefix = i.split(table)
            if len(prefix)>1 and i not in self.tables.keys():
                if i.split(table)[1][0]=='_' and len(i.split(table)[1].split('_'))-1==1:
                    result+=[i]
        return result

    def extract_column_names(self,raw):
        '''Strip the parent-table prefix (and its "_") from each full key,
        leaving the bare column name.'''
        result=[]
        for i in raw:
            tab = self.getParent(i)
            tmp=i.split(tab)
            result.append(tmp[-1][1:])
        return result

    def print_insert_statements(self,table,columns,values):
        '''Build one INSERT from parallel columns/values lists, quoting text
        values, and hand it to output().'''
        tab=table
        col_names=columns
        curr_val=values
        fields_dict = dict(zip(col_names, curr_val))
        for field in fields_dict.keys():
            # NOTE(review): this condition is always True — `or` is applied to
            # the non-empty lists, not to more `not in` tests; probably meant
            # `field not in self.NUMERIC + self.INTEGER + ...`. As written,
            # every non-NULL, non-UNQUOTED value gets quoted.
            if field not in self.NUMERIC or self.INTEGER or self.BIGINTEGER or self.UNQUOTED:
                field_value="".join(fields_dict[field])
                if (field_value != 'NULL') and (field not in self.UNQUOTED):
                    # Escape single quotes for SQL, then wrap in quotes.
                    field_value=field_value.replace('\'','\'\'')
                    fields_dict[field]='\''+field_value+'\''
        sql_columns= ",".join(fields_dict.keys())
        sql_values= ",".join(fields_dict.values())
        schema=''
        if len(self.SCHEMA)>0:
            schema=self.SCHEMA+'.'
        sql = 'insert into '+ schema + tab +" ("+sql_columns+") values ("+sql_values+");\n"
        sql = sql.encode('utf-8')
        self.output(sql)
           
    def generate_uids(self,tab):
        '''Allocate the next uid for `tab` from the in-memory counters and,
        for nested tables, look up the parent's current uid.

        NOTE(review): raises KeyError if the parent table has no counter yet.
        '''
        uid=1
        parent_uid=None
        if len(self.getParent(tab))>0:
            if tab in self.uids.keys():
                self.uids[tab]+=1
                uid=self.uids[tab]
                parent_uid=self.uids[self.getParent(tab)]
            else:
                self.uids[tab]=1
                parent_uid=self.uids[self.getParent(tab)]
        else:
            if tab in self.uids.keys():
                self.uids[tab]+=1
                uid=self.uids[tab]
            else:
                self.uids[tab]=1
        return uid,parent_uid

    def generate_uids_seq(self,tab):
        '''Sequence-based alternative to generate_uids(): return a currval()
        SQL expression for the parent table's uid sequence, or None for
        top-level tables.'''
        parent_uid=None
        if len(self.getParent(tab))>0:
            parent_uid="currval(pg_get_serial_sequence('%s', 'uid'))"%(self.getParent(tab),)
        return parent_uid

    def generate_insert(self,tab,col_names,val):
        '''Consume one value from each column's value list (mutates `val` via
        pop), attach parent_uid (nested) or file_id (top level), and emit the
        INSERT via print_insert_statements().'''
        curr_val=[]
        for i in val:
            if i:
                curr_val.append(i.pop())
            else:
                curr_val.append('NULL')

        parent_uid=self.generate_uids_seq(tab)

        if parent_uid != None:
            if 'parent_uid' not in col_names:
                col_names.append('parent_uid')
                curr_val.append(str(parent_uid))
            else:
                curr_val.append(str(parent_uid))
        else:
            # Top-level table rows are tied back to the source file instead.
            col_names.append('file_id')
            curr_val.append(str(self.file_id))

        col_names=self.remapColumns(col_names)      # do remap of Columns
        tab = self.remapTable(tab)                 # do remap of Tables
        self.print_insert_statements(tab,col_names,curr_val)


    def generate_insert_statements(self,columns_data):
        '''Emit INSERTs for every table: one per collected value row, popping
        values off columns_data's lists until the longest list is drained.'''
        tabs=sorted(self.tables.keys())
        columns=sorted(columns_data)
        for tab in tabs:
            col = self.extract_column_for_table(tab,columns) #Get what is columns not nested tables
            col_names = self.extract_column_names(col)   #Get names of this cloumns
            val =[]
            for i in col:
                val.append(columns_data[i])     #Fetch data for columns           
            max_list = [len(j)  for j in val]
            if len(max_list)>1:
                #print max_list
                for i in range(max(max_list)):
                    self.generate_insert(tab,col_names,val)
            else:
                self.generate_insert(tab,col_names,val)


    def save_uids(self):
        '''Persist the uid counters to uids.pkl for the next run.'''
        output = open('uids.pkl', 'wb')
        pickle.dump(self.uids, output)
        output.close()
Example #3
0
 def writeToDb(self,choice=True):
     '''Toggle DB output: (re)open the project's default DB connection and
     remember the choice in self.write_to_db.'''
     self.db_connector = DBConnector('default')
     self.dbConn=self.db_connector.getConn()
     self.curr=self.db_connector.getCursor()
     self.write_to_db=choice
Example #4
0
import ftplib
import psycopg2
import sys
from helper import DBConnector


# One-time setup: create the files_list table. If it already exists the
# CREATE fails and we just report the error and keep going.
try:
    db_conn=DBConnector('default')
    cur=db_conn.getCursor()
    # ';commit;' piggybacks a COMMIT onto the same execute() call.
    cur.execute("CREATE TABLE files_list (id serial PRIMARY KEY, path varchar unique, insert_time timestamp);commit;")
    del db_conn 
except Exception,e:
    print "Unable create table:",e


# Fresh connection/cursor used by the rest of the script.
db_conn=DBConnector('default')
curr=db_conn.getCursor()

# Anonymous FTP session to the public zakupki.gov.ru archive.
ftp = ftplib.FTP("ftp.zakupki.gov.ru")
ftp.login("free", "free")

# Paths discovered during the recursive FTP walk are collected here.
files = []
def ftp_walk(ftp):    
    print 'Path:', ftp.pwd()
    dirs = ftp.nlst()
    for item in (path for path in dirs if path not in ('.', '..')):
        try:
            ftp.cwd(item)
            print 'Changed to', ftp.pwd()
Example #5
0
class ZakupkiXMLParser(object):
    """Turns zakupki.gov.ru XML documents into relational tables.

    handle_element() walks an lxml tree, recording table structure in
    self.tables and cell data in self.values; the generate_* methods then
    emit CREATE TABLE and INSERT statements to stdout and/or PostgreSQL.

    NOTE(review): Python 2 code; depends on module-level names `schema`,
    `pickle`, `psycopg2`, `re`, `sys`, `datetime` and the project's
    DBConnector helper.
    """

    def __init__(self):
        # Discovered structure: table name -> list of column names.
        self.tables = dict()
        # Collected data: "<table>_<column>" -> list of lists of text values.
        self.values = dict()
        # Per-table uid counters, persisted across runs via uids.pkl.
        self.uids = dict()
        self.sql_inserts = []
        # Restore uid counters from a previous run; any failure (missing or
        # corrupt pickle) silently starts counters fresh.
        try:
            uids_file = open('uids.pkl', 'rb')
            self.uids = pickle.load(uids_file)
            uids_file.close()
        except:
            pass
        '''
        if CREATE_TABLES==True:
            try:
                tables_file = open('tables.pkl','rb')
                tables=pickle.load(tables_file)
                tables_file.close()
            except:
                pass
        '''
        # `schema` is assumed to exist at module level — TODO confirm where set.
        self.SCHEMA = schema
        self.PK = ['uid']
        self.FK_IS = ['uid']
        self.FK = ['parent_uid']
        # Column-name -> SQL type classification consumed by generate_types().
        self.INTEGER = [
            'month', 'year', 'id', 'versionNumber', 'kpp', 'countryCode'
        ]
        self.BIGINTEGER = ['inn', 'regNum', 'uid', 'parent_uid']
        self.FLOAT = []
        self.NUMERIC = ['price', 'sum'
                        ]  #'quantity'] Fond that in contracts it can be text
        self.VARCHAR = ['']
        self.UNQUOTED = ['parent_uid']
        # If need modify column names in output or modify tablenames in output

        # Renames for columns that collide with SQL keywords.
        self.COLUMN_MAPPING = {
            'placing': 'pplacing',
            'order': '"order"',
            'from': '"from"',
        }
        # Regex -> replacement pairs used to shorten generated table names
        # (PostgreSQL identifiers are capped at 63 chars). Keys are regex
        # patterns, applied via re.sub in remapTable().
        self.TABLES_MAPPING = {
            'protocol[A-Z]{2,}[0-9]|prot[A-Z]{2,}[0-9]': 'prtk',
            'protocol': 'prot',
            #'_protocolLot':'_iprotLot',
            'application': 'app',
            'admission': 'adm',
            'Result': 'Res',
            'Commission': 'Com',
            'Member': 'Mem',
            'Rejected': 'Rej',
            'Reason': 'Reas',
            'Participant': 'Part',
            'Condition': 'Cond',
            'evaluation': 'ev',
            'requirement': 'req',
            'Requirement': 'Req',
            'Compliance': 'Comp',
            'attributes': 'attr',
            'contract': 'contr',
            'criterion': 'crit',
            #'protocol':'prot',
            'notification[A-Z]{2,}[0-9]?': 'notf',
            'notification': 'notif',
            'Feature': 'Feat',
            '-': '',
            '\.': '',
        }

    def setDbConn(self, dbconn):
        '''Set DB connection'''
        # NOTE(review): the first assignment is immediately overwritten by the
        # live psycopg2 connection built from the same dsn string.
        self.dbConn = dbconn
        self.dbConn = psycopg2.connect(dbconn)
        self.curr = self.dbConn.cursor()

    def writeToConsole(self, choice=False):
        '''Toggle echoing of generated SQL to stdout (see output()).'''
        self.write_to_console = choice

    def writeToDb(self, choice=True):
        '''Toggle DB output; (re)opens the default project DB connection.'''
        self.db_connector = DBConnector('default')
        self.dbConn = self.db_connector.getConn()
        self.curr = self.db_connector.getCursor()
        self.write_to_db = choice

    def output(self, sql):
        '''Execute `sql` on the DB (logging failures into import_log) and/or
        echo it to stdout, according to the write_to_* flags.'''
        if self.write_to_db:
            try:
                self.curr.execute(sql)
                #self.dbConn+'commit;')
            except psycopg2.Error as e:
                # Failed statement: roll back, then record it for later review.
                self.dbConn.rollback()
                dt = datetime.datetime.now()
                self.curr.execute(
                    'insert into import_log (file_id,time,sql,error) values (%s,%s,%s,%s);',
                    (self.file_id, dt, sql, e.pgerror))
            finally:
                self.dbConn.commit()
        if self.write_to_console:
            sys.stdout.write(sql)

    def uniq(self, seq):
        '''Get Uniq in seq'''
        # Order-preserving de-duplication; seen_add is hoisted for speed.
        seen = set()
        seen_add = seen.add
        return [x for x in seq if x not in seen and not seen_add(x)]

    def remapColumns(self, columns):
        '''Return columns with COLUMN_MAPPING renames applied (others kept).'''
        result = []
        for i in columns:
            if i in self.COLUMN_MAPPING.keys():
                result.append(self.COLUMN_MAPPING[i])
            else:
                result.append(i)
        return result

    def remapTable(self, table):
        '''Shorten `table` by applying every TABLES_MAPPING regex in turn.

        NOTE(review): dict key order (arbitrary in py2) decides substitution
        order, and over-long/bad names only build a warning that is currently
        discarded (the stderr write is commented out).
        '''
        warning = ''
        result = ''
        wrong_simbols = ['-', '.']
        for i in self.TABLES_MAPPING.keys():
            table = re.sub(i, self.TABLES_MAPPING[i], table)
        result = table
        if len(result) > 63:
            warning += '--Warning table name too long:' + table + ':' + table[
                0:63] + '\n'
        if any(s in result for s in wrong_simbols):
            warning += '--Warning wrong simbol in table name:' + result + '\n'
        if len(warning) > 0:
            pass
            #sys.stderr.write('\033[93m'+warning+'\033[0m')
        if len(result) > 0:
            return result
        else:
            return table

    def handle_element(self, parent, elem):
        '''Recursively walk `elem`: elements with children register a table
        keyed "<parent>_<name>"; leaf elements append their tag to the parent
        table's column list and their text to self.values.'''
        #print parent,elem
        if len(elem.xpath("./*")) > 0:
            # Non-leaf: ensure table/value buckets exist for this subtree.
            key = parent + '_' + (elem.xpath("local-name()"))
            key = self.remapTable(key)
            if key not in self.tables.keys():
                self.tables[key] = []
                #tables[parent+'_'+(elem.xpath("local-name()"))]=[]
            if key not in self.values.keys():
                self.values[key] = []
                #values[parent+'_'+(elem.xpath("local-name()"))]=[]
            #next_element.append(elem)
        else:
            # Leaf: record column name and its text() node list.
            self.tables[parent] += [elem.xpath("local-name()")]
            column_name = parent + '_' + elem.xpath("local-name()")
            if column_name not in self.values.keys():
                self.values[column_name] = []
                self.values[column_name] += [elem.xpath("text()")]
            else:
                self.values[column_name] += [elem.xpath("text()")]
        for i in elem:
            el = self.remapTable(elem.xpath("local-name()"))
            par = parent + '_' + el
            #par=remapTable(par)
            self.handle_element(par, i)

    def generate_types(self, columns):
        '''Map each column name to "name sqltype" using the class type lists;
        anything unclassified falls back to text.'''
        result = []
        for i in columns:
            if i in self.PK:
                result.append(i + ' bigint primary key')
            elif i in self.INTEGER:
                result.append(i + ' integer')
            elif i in self.BIGINTEGER:
                result.append(i + ' bigint')
            elif i in self.VARCHAR:
                result.append(i + ' varchar(2000)')
            elif i in self.FLOAT:
                result.append(i + ' float')
            elif i in self.NUMERIC:
                result.append(i + ' numeric')
            else:
                result.append(i + ' text')
        return result

    def getParent(self, table):
        '''Return the parent table name: everything before the last "_"
        ("" for top-level names).'''
        table_list = table.split('_')
        # NOTE(review): result is assigned three times; only the final
        # "".join(parent) (== parent) survives.
        result = ["_"]
        parent = "_".join(table_list[0:-1])
        result += parent
        result = "".join(parent)
        return result

    def sortByHierarhy(self, tables_list):
        # NOTE(review): list.sort() sorts in place and returns None, so this
        # method always returns None — confirm callers don't use the result.
        return tables_list.sort()

    def generate_create_tables(self, tables):
        '''Emit CREATE TABLE for every discovered table, adding the uid PK and
        a parent_uid FK for nested tables. Writes to stdout only, and only
        when SCHEMA is non-empty. The `tables` argument is unused.'''
        for i in sorted(self.tables.keys()):
            columns = self.uniq(self.tables[i])
            columns.append(self.PK[0])

            columns = self.remapColumns(columns)
            columns = self.generate_types(columns)

            tab = self.remapTable(i)
            columns = ",".join(columns)

            parent = self.getParent(i)
            parent = self.remapTable(parent)

            sql = ''
            schema = ''

            if len(self.SCHEMA) > 0:
                schema = self.SCHEMA + '.'
                if len(parent) == 0:
                    sql = "create table " + schema + tab + " (" + columns + ");"
                else:
                    sql = "create table " + schema + tab + " (" + columns + ", parent_uid bigint references " + parent + "(" + self.FK_IS[
                        0] + ")" + " );\n"
                sys.stdout.write(sql)

    def extract_column_for_table(self, table, columns):
        '''From `columns` (full "<table>_<column>" keys) keep only direct
        columns of `table`: the suffix after `table` is "_<name>" with exactly
        one underscore, and the key is not itself a table.'''
        result = []
        for i in columns:
            prefix = i.split(table)
            if len(prefix) > 1 and i not in self.tables.keys():
                if i.split(table)[1][0] == '_' and len(
                        i.split(table)[1].split('_')) - 1 == 1:
                    result += [i]
        return result

    def extract_column_names(self, raw):
        '''Strip the parent-table prefix (and its "_") from each full key,
        leaving the bare column name.'''
        result = []
        for i in raw:
            tab = self.getParent(i)
            tmp = i.split(tab)
            result.append(tmp[-1][1:])
        return result

    def print_insert_statements(self, table, columns, values):
        '''Build one INSERT from parallel columns/values lists, quoting text
        values, and hand it to output().'''
        tab = table
        col_names = columns
        curr_val = values
        fields_dict = dict(zip(col_names, curr_val))
        for field in fields_dict.keys():
            # NOTE(review): this condition is always True — `or` is applied to
            # the non-empty lists, not to more `not in` tests; probably meant
            # `field not in self.NUMERIC + self.INTEGER + ...`. As written,
            # every non-NULL, non-UNQUOTED value gets quoted.
            if field not in self.NUMERIC or self.INTEGER or self.BIGINTEGER or self.UNQUOTED:
                field_value = "".join(fields_dict[field])
                if (field_value != 'NULL') and (field not in self.UNQUOTED):
                    # Escape single quotes for SQL, then wrap in quotes.
                    field_value = field_value.replace('\'', '\'\'')
                    fields_dict[field] = '\'' + field_value + '\''
        sql_columns = ",".join(fields_dict.keys())
        sql_values = ",".join(fields_dict.values())
        schema = ''
        if len(self.SCHEMA) > 0:
            schema = self.SCHEMA + '.'
        sql = 'insert into ' + schema + tab + " (" + sql_columns + ") values (" + sql_values + ");\n"
        sql = sql.encode('utf-8')
        self.output(sql)

    def generate_uids(self, tab):
        '''Allocate the next uid for `tab` from the in-memory counters and,
        for nested tables, look up the parent's current uid.

        NOTE(review): raises KeyError if the parent table has no counter yet.
        '''
        uid = 1
        parent_uid = None
        if len(self.getParent(tab)) > 0:
            if tab in self.uids.keys():
                self.uids[tab] += 1
                uid = self.uids[tab]
                parent_uid = self.uids[self.getParent(tab)]
            else:
                self.uids[tab] = 1
                parent_uid = self.uids[self.getParent(tab)]
        else:
            if tab in self.uids.keys():
                self.uids[tab] += 1
                uid = self.uids[tab]
            else:
                self.uids[tab] = 1
        return uid, parent_uid

    def generate_uids_seq(self, tab):
        '''Sequence-based alternative to generate_uids(): return a currval()
        SQL expression for the parent table's uid sequence, or None for
        top-level tables.'''
        parent_uid = None
        if len(self.getParent(tab)) > 0:
            parent_uid = "currval(pg_get_serial_sequence('%s', 'uid'))" % (
                self.getParent(tab), )
        return parent_uid

    def generate_insert(self, tab, col_names, val):
        '''Consume one value from each column's value list (mutates `val` via
        pop), attach parent_uid (nested) or file_id (top level), and emit the
        INSERT via print_insert_statements().'''
        curr_val = []
        for i in val:
            if i:
                curr_val.append(i.pop())
            else:
                curr_val.append('NULL')

        parent_uid = self.generate_uids_seq(tab)

        if parent_uid != None:
            if 'parent_uid' not in col_names:
                col_names.append('parent_uid')
                curr_val.append(str(parent_uid))
            else:
                curr_val.append(str(parent_uid))
        else:
            # Top-level table rows are tied back to the source file instead.
            col_names.append('file_id')
            curr_val.append(str(self.file_id))

        col_names = self.remapColumns(col_names)  # do remap of Columns
        tab = self.remapTable(tab)  # do remap of Tables
        self.print_insert_statements(tab, col_names, curr_val)

    def generate_insert_statements(self, columns_data):
        '''Emit INSERTs for every table: one per collected value row, popping
        values off columns_data's lists until the longest list is drained.'''
        tabs = sorted(self.tables.keys())
        columns = sorted(columns_data)
        for tab in tabs:
            col = self.extract_column_for_table(
                tab, columns)  #Get what is columns not nested tables
            col_names = self.extract_column_names(
                col)  #Get names of this cloumns
            val = []
            for i in col:
                val.append(columns_data[i])  #Fetch data for columns
            max_list = [len(j) for j in val]
            if len(max_list) > 1:
                #print max_list
                for i in range(max(max_list)):
                    self.generate_insert(tab, col_names, val)
            else:
                self.generate_insert(tab, col_names, val)

    def save_uids(self):
        '''Persist the uid counters to uids.pkl for the next run.'''
        output = open('uids.pkl', 'wb')
        pickle.dump(self.uids, output)
        output.close()
Example #6
0
 def writeToDb(self, choice=True):
     '''Toggle DB output: (re)open the project's default DB connection and
     remember the choice in self.write_to_db.'''
     self.db_connector = DBConnector('default')
     self.dbConn = self.db_connector.getConn()
     self.curr = self.db_connector.getCursor()
     self.write_to_db = choice
Example #7
0
import ftplib
import psycopg2
import sys
from helper import DBConnector

# One-time setup: create the files_list table. If it already exists the
# CREATE fails and we just report the error and keep going.
try:
    db_conn = DBConnector('default')
    cur = db_conn.getCursor()
    # ';commit;' piggybacks a COMMIT onto the same execute() call.
    cur.execute(
        "CREATE TABLE files_list (id serial PRIMARY KEY, path varchar unique, insert_time timestamp);commit;"
    )
    del db_conn
except Exception, e:
    print "Unable create table:", e

# Fresh connection/cursor used by the rest of the script.
db_conn = DBConnector('default')
curr = db_conn.getCursor()

# Anonymous FTP session to the public zakupki.gov.ru archive.
ftp = ftplib.FTP("ftp.zakupki.gov.ru")
ftp.login("free", "free")

# Paths discovered during the recursive FTP walk are collected here.
files = []


def ftp_walk(ftp):
    print 'Path:', ftp.pwd()
    dirs = ftp.nlst()
    for item in (path for path in dirs if path not in ('.', '..')):
        try:
            ftp.cwd(item)
            print 'Changed to', ftp.pwd()