def ResetCons(self):
    """Re-open the Finnish and Russian database connections.

    Rebinds self.cons[lang]['db'] for both languages with fresh psycopg
    wrapper objects.

    NOTE(review): the old connection objects are simply dropped and
    replaced; whether the wrapper closes the underlying connection on
    garbage collection is not visible here -- confirm against dbmodule.
    """
    print('\n\nRESETTING the db connections, this is a standard procedure...\n\n')
    # Rebinding the keys is sufficient; the original's separate `del` of
    # each key immediately before reassignment was redundant for a dict.
    self.cons['fi']['db'] = psycopg('tb_fi', 'juho')
    self.cons['ru']['db'] = psycopg('tb_ru2', 'juho')
def __init__(self, selecteddb):
    """Load the searched table's columns and build the printed option menu.

    Queries information_schema for the column names of Db.searched_table
    and wraps each column that is not in ConditionSet.ignoredcolumns in a
    ConllColumn object, keyed by a 1-based integer index.

    Parameters:
        selecteddb: name of the database to connect to.
    """
    # Remember the database name so that methods reconnecting later
    # (e.g. ListColumns, which reads self.selecteddb) can use it; the
    # original never stored it, which made those reads raise
    # AttributeError.
    self.selecteddb = selecteddb
    self.columnnames = dict()
    self.columns = dict()
    # Initialize printed options
    self.optionstring = ''
    self.optiontable = Texttable()
    self.optiontable.set_cols_align(["l", "l"])
    self.optiontable.set_cols_valign(["m", "m"])
    self.optiontable.add_row(['Column', 'Possible values'])
    self.FormatOptionString()
    self.condcols = None
    self.headcols = None
    psycon = psycopg(selecteddb, 'juho')
    rows = psycon.FetchQuery(
        'SELECT column_name FROM information_schema.columns WHERE table_name = %s',
        (Db.searched_table, ))
    colindex = 1
    for row in rows:
        # Add a new column object to the columnlist if it makes sense to add it
        if row[0] not in ConditionSet.ignoredcolumns:
            self.columns[colindex] = ConllColumn(name=row[0], con=psycon)
            self.columnnames[str(colindex)] = self.columns[colindex].screenname
            colindex += 1
def ListColumns(self):
    """Populate self.columns with the searched table's column names.

    No-op when the column cache already has entries; otherwise the names
    are fetched from information_schema and stored under string indices
    ("0", "1", ...) in query order.
    """
    if self.columns:
        return
    connection = psycopg(self.selecteddb, 'juho')
    name_rows = connection.FetchQuery(
        'SELECT column_name FROM information_schema.columns WHERE table_name = %s',
        (Db.searched_table, ))
    # Mutate the existing dict in place rather than rebinding it.
    self.columns.update(
        {str(position): name_row[0]
         for position, name_row in enumerate(name_rows)})
def __init__(self, nouncount=None):
    """Start loggers, get connections etc"""
    StartLogger()
    # One connection object and one table name per language.
    self.cons = {
        'fi': {'db': psycopg('tb_fi', 'juho'), 'table': "fi_conll"},
        'ru': {'db': psycopg('tb_ru2', 'juho'), 'table': "ru_conll"},
    }
    logging.info("Connections established.")
    self.nouns = {'fi': 0, 'ru': 0}
    if nouncount:
        # Precomputed counts supplied by the caller.
        self.nouns['fi'] = nouncount['fi']
        self.nouns['ru'] = nouncount['ru']
    else:
        # No counts given: query the databases ourselves.
        self.CountNouns()
    self.results = dict()
    self.searches = list()
def InsertPair(dbname=None, slfile=None, tlfile=None, sl_tablename=None,
               tl_tablename=None, reference_file=None, retrans=False):
    """Method for inserting one file pair either according to cmdline arguments or by function arguments

    Parameters:
        dbname: target database name; when falsy, all inputs are read
            from sys.argv instead (command line mode).
        slfile / tlfile: paths to the source / target conll files.
        sl_tablename / tl_tablename: destination table names.
        reference_file: passed through to SourceText.
        retrans: True when one source text has several translations; the
            target files are then read from sys.argv[4:] in command line
            mode, or taken from `tlfile` in programmatic mode.

    Returns:
        [sl, tltexts] in retrans mode, otherwise None.

    Raises:
        ArgumentError: when command line arguments are missing.
    """
    tlfiles = None  # only used in retrans mode
    if not dbname:
        # If not run from another method, get command line input:
        try:
            if retrans:
                dbname = sys.argv[2]
                slfile = sys.argv[3]
                tlfiles = sys.argv[4:]
                sl_tablename = 'ru_conll'
                tl_tablename = 'fi_conll'
            else:
                dbname = sys.argv[1]
                slfile = sys.argv[2]
                tlfile = sys.argv[3]
                sl_tablename = sys.argv[4] + '_conll'
                tl_tablename = sys.argv[5] + '_conll'
        except IndexError:
            raise ArgumentError(
                'Usage: {} <database name> <sl file> <tl file> <source language> <target language>'
                .format(sys.argv[0]))
    con = psycopg(dbname, 'juho')
    sl = SourceText(sl_tablename, slfile, con, reference_file)
    sl.CollectSegments()
    sl.InsertToDb(con)
    if tl_tablename:
        if retrans:
            if tlfiles is None:
                # Called programmatically with retrans=True: the original
                # raised NameError here because tlfiles was only bound in
                # the command line branch. Fall back to the single tlfile
                # argument.
                tlfiles = [tlfile] if tlfile else []
            tltexts = list()
            for tlfile in tlfiles:
                tltexts.append(
                    Translation(tl_tablename, tlfile, con, sl.text_id,
                                sl.table))
                tltexts[-1].CollectSegments()
                tltexts[-1].InsertToDb(con)
            return [sl, tltexts]
        else:
            # If this is an ordinary bilingual file
            tl = Translation(tl_tablename, tlfile, con, sl.text_id, sl.table)
            tl.CollectSegments()
            tl.InsertToDb(con)
def __init__(self, test=False, ext_data=None):
    """Start loggers, get connections etc"""
    StartLogger()
    self.cons = dict()
    self.externaldata = dict()
    self.metadata = dict()
    if test:
        # For testing, use only the smaller database
        self.cons['fi'] = {'araneum_fi': psycopg('araneum_fi', 'juho')}
    else:
        self.cons['fi'] = {
            'araneum_fi': psycopg('araneum_fi', 'juho'),
            'press_fi': None,
        }
        self.cons['ru'] = {
            'araneum_ru': psycopg('araneum_ru', 'juho'),
            'press_ru': psycopg('press_ru', 'juho'),
        }
        self.externaldata['press_fi'] = None
    logging.info('Fetching metadata...')
    if not test:
        # Cache id/title pairs for every live connection.
        for lang in ('fi', 'ru'):
            for con_name, con in self.cons[lang].items():
                logging.info('{},{}'.format(lang, con_name))
                if not con:
                    continue  # placeholder entries (None) have no metadata
                self.metadata[con_name] = con.FetchQuery(
                    'SELECT id, title FROM text_ids', usedict=True)
    logging.info("Connections established.")
    self.results = dict()
    self.searches = dict()
def main():
    """Read a conll formatted source-language file, split it into aligned
    segments, and batch-insert the tokens into the given database table.

    Command line: <conll file> <database name> <table name>.
    Side effects: prompts for a title on stdin, inserts a text_ids row,
    then bulk-inserts token rows into the table.
    """
    #Get command line input:
    try:
        conllinputfile = sys.argv[1]
        sl_dbname = sys.argv[2]
        tablename = sys.argv[3]
    except IndexError:
        print('Usage: {} <path to conll formatted text file> <database name> <source language database table name>'.format(sys.argv[0]))
        sys.exit(0)
    #================================================================================
    #Connect to db
    con = psycopg(sl_dbname,'juho')
    #read the conll data
    with open(conllinputfile, 'r') as f:
        conllinput = f.read()
    # Split the file into aligned segments according to the !!!! -notation
    # (four consecutive one-token "!" lines act as a segment separator).
    splitpattern = re.compile(r"\d+\t![^\n]+\n\n?\d+\t![^\n]+\n\n?\d+\t![^\n]+\n\n?\d+\t![^\n]+\n\n")
    alignsegments = re.split(splitpattern,conllinput)
    #Filter out empty align segments
    alignsegments = TrimList(alignsegments)
    #Get the current maximum indices:
    # New ids continue from the current maxima so existing rows are kept.
    sentence_id = GetLastValue(con.FetchQuery("SELECT max(sentence_id) FROM {}".format(tablename)))
    align_id = GetLastValue(con.FetchQuery("SELECT max(align_id) FROM {}".format(tablename)))
    #Insert a new entry in the text_ids table
    con.query("INSERT INTO text_ids (title) values(%s)", (input('Give a title for this text:\n'),))
    text_id = GetLastValue(con.FetchQuery("SELECT max(id) FROM text_ids"))
    #Initialize variales for db insertion
    rowlist = list()
    bar = Bar('Preparing the data for insertion into the database', max=len(alignsegments))
    #================================================================================
    for segment in alignsegments:
        #Split each segment into lines (line=word with all the morphological and syntactic information)
        words = segment.splitlines()
        align_id += 1
        sentence_id += 1
        for word in words:
            #read all the information about the word
            if word == '':
                #empty lines are sentence breaks
                sentence_id += 1
            else:
                columns = word.split('\t')
                if len(columns) < 6:
                    #If an empty segment encountered
                    # Too few tab-separated fields: insert a placeholder
                    # row so the alignment numbering stays intact.
                    print('Note: an empty segment encountered at align_id {}'.format(align_id))
                    rowlist.append({'align_id' : align_id, 'sentence_id' : sentence_id, 'text_id' : text_id,
                                    'tokenid' : 1, 'token' : 'EMPTYSEGMENT', 'lemma' : 'EMPTYSEGMENT',
                                    'pos' : 'EMPTYSEGMENT', 'feat' : 'EMPTYSEGMENT', 'head' : 0,
                                    'deprel' : 'EMPTY'})
                else:
                    #If this is a word with information, initialize a new row
                    # NOTE(review): the two tables map different column
                    # indices -- presumably the Russian and Finnish parsers
                    # emit different conll variants; confirm against the
                    # parser output before changing.
                    if tablename == 'ru_conll':
                        rowlist.append({'align_id' : align_id, 'sentence_id' : sentence_id,
                                        'text_id' : text_id, 'tokenid' : columns[0],
                                        'token' : columns[1], 'lemma' : columns[2],
                                        'pos' : columns[4], 'feat' : columns[5],
                                        'head' : columns[6], 'deprel' : columns[7]})
                    elif tablename == 'fi_conll':
                        rowlist.append({'align_id' : align_id, 'sentence_id' : sentence_id,
                                        'text_id' : text_id, 'tokenid' : columns[0],
                                        'token' : columns[1], 'lemma' : columns[2],
                                        'pos' : columns[4], 'feat' : columns[6],
                                        'head' : columns[8], 'deprel' : columns[10]})
        # One progress tick per aligned segment (bar max == len(alignsegments)).
        bar.next()
    #================================================================================
    bar.finish()
    print('\nInserting to database, this might take a while...')
    con.BatchInsert(tablename,rowlist)
    print('Done. Inserted {} rows.'.format(con.cur.rowcount))
from dbmodule import psycopg

# One-off maintenance script: register every distinct sentence beyond the
# given id in fi_conll as a member of group "lc1b" (corpus "araneum").
con = psycopg("araneum_fi", 'juho')
sentence_rows = con.FetchQuery(
    "SELECT DISTINCT sentence_id FROM fi_conll WHERE sentence_id > 1199689")
insert_statement = (
    "INSERT INTO groups (name, sentence_id, corpus) values(%s, %s, %s)")
for row in sentence_rows:
    # Commit each insert separately so partial progress survives a crash.
    con.query(insert_statement, ("lc1b", row[0], "araneum"), commit=True)
def main():
    """Insert a target-language (translation) conll file, aligning each of
    its segments with the align_ids of an already-inserted source text.

    Command line: <conll file> <source text id> <database name>
    <source table name> <target table name>.
    Side effects: prompts for the translator's name on stdin, inserts a
    translation_ids row, then bulk-inserts token rows into the target table.
    Raises MissingTextError when the given text id is unknown and
    AlignMismatch when the segment counts of source and translation differ.
    """
    #Get command line input:
    try:
        conllinputfile = sys.argv[1]
        text_id = sys.argv[2]
        dbname = sys.argv[3]
        sl_dbtablename = sys.argv[4]
        tl_dbtablename = sys.argv[5]
    except:
        print('''Usage: {} <path to target language conll formatted text> <text id of the inserted source language text> <database name> <source language database table name> <target language database table name> '''.format(sys.argv[0]))
        sys.exit(0)
    #Connect to the database
    con = psycopg(dbname, 'juho')
    #read the conll data
    with open(conllinputfile, 'r') as f:
        conllinput = f.read()
    #fetch the id of the pair that is already inserted
    text_id = con.FetchQuery(
        "SELECT id FROM {} WHERE id = %s".format('text_ids'), (text_id, ))
    try:
        text_id = text_id[0][0]
    except IndexError:
        raise MissingTextError('No such id in the text_ids table')
    #Get all the align ids that were inserted with the first file
    align_ids = con.FetchQuery(
        "SELECT DISTINCT align_id FROM {} WHERE text_id = %s order by align_id"
        .format(sl_dbtablename), (text_id, ))
    # Split the translation file into aligned segments according to the !!!! -notation
    # (four consecutive one-token "!" lines act as a segment separator).
    splitpattern = re.compile(
        r"\d+\t![^\n]+\n\n?\d+\t![^\n]+\n\n?\d+\t![^\n]+\n\n?\d+\t![^\n]+\n\n")
    alignsegments = re.split(splitpattern, conllinput)
    #Filter out empty align segments
    alignsegments = TrimList(alignsegments)
    #Test that same number of segments
    if len(alignsegments) != len(align_ids):
        raise AlignMismatch(
            'The number of segments differs from the number in the source text: {}/{}'
            .format(len(alignsegments), len(align_ids)))
    #Get the current maximum indices:
    # New sentence ids continue from the current maximum in the target table.
    sentence_id = GetLastValue(
        con.FetchQuery(
            "SELECT max(sentence_id) FROM {}".format(tl_dbtablename)))
    #Insert a new entry in the translation_ids table
    translator = input('Give the author for this translation:\n')
    con.query(
        "INSERT INTO translation_ids (translator, sourcetext_id) VALUES(%s, %s)",
        (translator, text_id, ), commit=True)
    translation_id = GetLastValue(
        con.FetchQuery(
            "SELECT max(id) FROM translation_ids WHERE sourcetext_id = %(sid)s",
            {'sid': text_id}))
    #Initialize variales for db insertion
    rowlist = list()
    bar = Bar('Preparing the data for insertion into the database',
              max=len(alignsegments))
    #================================================================================
    # Walk the source text's align_ids in order; the i-th translation
    # segment reuses the i-th source align_id.
    for idx, align_id in enumerate(align_ids):
        align_id = align_id[0]
        segment = alignsegments[idx]
        #Split each segment into lines (line=word with all the morphological and syntactic information)
        words = segment.splitlines()
        sentence_id += 1
        for word in words:
            #read all the information about the word
            if word == '':
                #empty lines are sentence breaks
                sentence_id += 1
            else:
                columns = word.split('\t')
                if len(columns) < 7:
                    #If an empty segment encountered
                    # Too few tab-separated fields: insert a placeholder
                    # row so the alignment numbering stays intact.
                    print('Note: an empty segment encountered at align_id {}'.format(align_id))
                    rowlist.append({
                        'align_id': align_id,
                        'sentence_id': sentence_id,
                        'text_id': text_id,
                        'translation_id': translation_id,
                        'tokenid': 1,
                        'token': 'EMPTYSEGMENT',
                        'lemma': 'EMPTYSEGMENT',
                        'pos': 'EMPTYSEGMENT',
                        'feat': 'EMPTYSEGMENT',
                        'head': 0,
                        'deprel': 'EMPTY'
                    })
                else:
                    #If this is a word with information, initialize a new row
                    # NOTE(review): the branch tests the SOURCE table name
                    # because the translation is in the opposite language
                    # (fi source -> ru-style columns, ru source -> fi-style
                    # columns) -- presumably matching the two parsers'
                    # conll variants; confirm before changing.
                    if sl_dbtablename == 'fi_conll':
                        rowlist.append({
                            'align_id': align_id,
                            'sentence_id': sentence_id,
                            'text_id': text_id,
                            'translation_id': translation_id,
                            'tokenid': columns[0],
                            'token': columns[1],
                            'lemma': columns[2],
                            'pos': columns[4],
                            'feat': columns[5],
                            'head': columns[6],
                            'deprel': columns[7]
                        })
                    elif sl_dbtablename == 'ru_conll':
                        rowlist.append({
                            'align_id': align_id,
                            'sentence_id': sentence_id,
                            'text_id': text_id,
                            'translation_id': translation_id,
                            'tokenid': columns[0],
                            'token': columns[1],
                            'lemma': columns[2],
                            'pos': columns[4],
                            'feat': columns[6],
                            'head': columns[8],
                            'deprel': columns[10]
                        })
        # One progress tick per aligned segment (bar max == len(alignsegments)).
        bar.next()
    #================================================================================
    bar.finish()
    print('\nInserting to database, this might take a while...')
    con.BatchInsert(tl_dbtablename, rowlist)
    print('Done. Inserted {} rows.'.format(con.cur.rowcount))
# Commit: con.connection.commit() if __name__ == "__main__": #Initialize a logger and start the function that creates the contrastive layer root = logging.getLogger() root.setLevel(logging.DEBUG) formatter = logging.Formatter('%(asctime)s: %(message)s') ch = logging.StreamHandler(sys.stdout) ch.setLevel(logging.DEBUG) fh = logging.FileHandler('logof_contrastivelayer.txt') fh.setLevel(logging.DEBUG) fh.setFormatter(formatter) ch.setFormatter(formatter) root.addHandler(fh) root.addHandler(ch) #Connect: prcon = psycopg('syntparrus','juho') pfcon = psycopg('syntparfin','juho') logging.info('\n{0} \nSTART CREATING THE CONTRASTIVE LAYER \n{0} \n'.format('*'*60)) #logging.info('\n{0} \n The ParRus database \n{0} \n'.format('-'*60)) #createContrastiveLayer(prcon) logging.info('\n{0} \n The ParFin database \n{0} \n'.format('-'*60)) createContrastiveLayer(pfcon)