Example no. 1
0
    def ResetCons(self):
        """Tear down and re-open the Finnish and Russian db connections."""
        print(
            '\n\nRESETTING the db connections, this is a standard procedure...\n\n'
        )

        # Drop the stale connection objects first, then reconnect.
        for lang in ('fi', 'ru'):
            del self.cons[lang]['db']

        self.cons['fi']['db'] = psycopg('tb_fi', 'juho')
        self.cons['ru']['db'] = psycopg('tb_ru2', 'juho')
Example no. 2
0
    def __init__(self, selecteddb):
        """Build the column lookup and the printable options table for *selecteddb*."""
        self.columnnames = {}
        self.columns = {}

        # Set up the table that is printed when listing the options.
        self.optionstring = ''
        self.optiontable = Texttable()
        self.optiontable.set_cols_align(["l", "l"])
        self.optiontable.set_cols_valign(["m", "m"])
        self.optiontable.add_row(['Column', 'Possible values'])
        self.FormatOptionString()
        self.condcols = None
        self.headcols = None

        # Fetch the column names of the searched table from the db schema.
        psycon = psycopg(selecteddb, 'juho')
        rows = psycon.FetchQuery(
            'SELECT column_name FROM information_schema.columns WHERE table_name = %s',
            (Db.searched_table, ))
        colindex = 1
        for row in rows:
            colname = row[0]
            # Skip columns that make no sense to offer as search options.
            if colname in ConditionSet.ignoredcolumns:
                continue
            self.columns[colindex] = ConllColumn(name=colname, con=psycon)
            self.columnnames[str(colindex)] = self.columns[colindex].screenname
            colindex += 1
Example no. 3
0
 def ListColumns(self):
     """Lazily fill self.columns with the db table's column names, keyed by index."""
     if not self.columns:
         psycon = psycopg(self.selecteddb, 'juho')
         rows = psycon.FetchQuery(
             'SELECT column_name FROM information_schema.columns WHERE table_name = %s',
             (Db.searched_table, ))
         for position, record in enumerate(rows):
             self.columns[str(position)] = record[0]
Example no. 4
0
 def __init__(self, nouncount=None):
     """Start loggers, get connections etc"""
     StartLogger()
     # One db connection and one conll table per language.
     self.cons = {'fi': {}, 'ru': {}}
     self.cons['fi']['db'] = psycopg('tb_fi', 'juho')
     self.cons['ru']['db'] = psycopg('tb_ru2', 'juho')
     self.cons['fi']['table'] = "fi_conll"
     self.cons['ru']['table'] = "ru_conll"
     logging.info("Connections established.")
     self.nouns = {'fi': 0, 'ru': 0}
     # Use the caller-supplied noun counts when given, otherwise count now.
     if nouncount:
         self.nouns['fi'] = nouncount['fi']
         self.nouns['ru'] = nouncount['ru']
     else:
         self.CountNouns()
     self.results = {}
     self.searches = []
Example no. 5
0
def InsertPair(dbname=None,
               slfile=None,
               tlfile=None,
               sl_tablename=None,
               tl_tablename=None,
               reference_file=None,
               retrans=False):
    """Method for inserting one file pair either according to cmdline arguments or by function arguments

    When *dbname* is omitted the arguments are read from sys.argv.
    With retrans=True several target-language files are inserted for one
    source file and [sl, tltexts] is returned; otherwise a single
    translation is inserted and nothing is returned.

    Bugfix: previously, calling this programmatically (dbname given) with
    retrans=True crashed with a NameError because ``tlfiles`` was only
    bound in the command-line branch. Now ``tlfile`` may be a single path
    or a list of paths in that case.

    Raises:
        ArgumentError: if run from the command line with too few arguments.
    """
    tlfiles = None
    if not dbname:
        #If not run from another method, get command line input:
        try:
            if retrans:
                dbname = sys.argv[2]
                slfile = sys.argv[3]
                tlfiles = sys.argv[4:]
                sl_tablename = 'ru_conll'
                tl_tablename = 'fi_conll'
            else:
                dbname = sys.argv[1]
                slfile = sys.argv[2]
                tlfile = sys.argv[3]
                sl_tablename = sys.argv[4] + '_conll'
                tl_tablename = sys.argv[5] + '_conll'
        except IndexError:
            raise ArgumentError(
                'Usage: {} <database name> <sl file> <tl file> <source language> <target language>'
                .format(sys.argv[0]))
    elif retrans:
        #Called from another method: accept one path or a list of paths
        tlfiles = tlfile if isinstance(tlfile, list) else [tlfile]

    con = psycopg(dbname, 'juho')

    #Insert the source text first to obtain its text_id
    sl = SourceText(sl_tablename, slfile, con, reference_file)
    sl.CollectSegments()
    sl.InsertToDb(con)

    if tl_tablename:
        if retrans:
            tltexts = list()
            for tlfile in tlfiles:
                tltexts.append(
                    Translation(tl_tablename, tlfile, con, sl.text_id,
                                sl.table))
                tltexts[-1].CollectSegments()
                tltexts[-1].InsertToDb(con)
            return [sl, tltexts]
        else:
            #If this is an ordinary bilingual file
            tl = Translation(tl_tablename, tlfile, con, sl.text_id, sl.table)
            tl.CollectSegments()
            tl.InsertToDb(con)
Example no. 6
0
    def __init__(self, test=False, ext_data=None):
        """Start loggers, get connections etc"""
        StartLogger()
        self.cons = {}
        self.externaldata = {}

        if test:
            #For testing, use only the smaller database
            self.cons['fi'] = {'araneum_fi': psycopg('araneum_fi', 'juho')}
        else:
            self.cons['fi'] = {
                'araneum_fi': psycopg('araneum_fi', 'juho'),
                'press_fi': None
            }
            self.cons['ru'] = {
                'araneum_ru': psycopg('araneum_ru', 'juho'),
                'press_ru': psycopg('press_ru', 'juho')
            }

        self.externaldata['press_fi'] = None

        self.metadata = {}
        logging.info('Fetching metadata...')
        if not test:
            # Cache id/title pairs for every live connection.
            for lang in ('fi', 'ru'):
                for con_name, con in self.cons[lang].items():
                    logging.info('{},{}'.format(lang, con_name))
                    if not con:
                        continue
                    self.metadata[con_name] = con.FetchQuery(
                        'SELECT id, title FROM text_ids', usedict=True)

        logging.info("Connections established.")

        self.results = {}
        self.searches = {}
Example no. 7
0
def main():
    """Insert a conll-formatted source-language file into the database.

    Reads the file named on the command line, splits it into aligned
    segments, assigns fresh align/sentence/text ids continuing from the
    table's current maxima, and batch-inserts one row per token.
    """
    #Get command line input:
    try:
        conllinputfile = sys.argv[1]
        sl_dbname = sys.argv[2]
        tablename = sys.argv[3]
    except IndexError:
        print('Usage: {} <path to conll formatted text file> <database name> <source language database table name>'.format(sys.argv[0]))
        sys.exit(0)

    #================================================================================

    #Connect to db
    con = psycopg(sl_dbname,'juho')
    #read the conll data
    with open(conllinputfile, 'r') as f:
        conllinput = f.read()

    # Split the file into aligned segments according to the !!!! -notation
    # (four consecutive one-token lines whose token starts with "!")
    splitpattern = re.compile(r"\d+\t![^\n]+\n\n?\d+\t![^\n]+\n\n?\d+\t![^\n]+\n\n?\d+\t![^\n]+\n\n")
    alignsegments = re.split(splitpattern,conllinput)
    #Filter out empty align segments
    alignsegments = TrimList(alignsegments)

    #Get the current maximum indices:
    sentence_id = GetLastValue(con.FetchQuery("SELECT max(sentence_id) FROM {}".format(tablename)))
    align_id    = GetLastValue(con.FetchQuery("SELECT max(align_id) FROM {}".format(tablename)))
    #Insert a new entry in the text_ids table
    con.query("INSERT INTO text_ids (title) values(%s)", (input('Give a title for this text:\n'),))
    # max(id) is assumed to be the id of the row inserted just above
    text_id     = GetLastValue(con.FetchQuery("SELECT max(id) FROM text_ids"))

    #Initialize variales for db insertion
    rowlist = list()
    bar = Bar('Preparing the data for insertion into the database', max=len(alignsegments))

    #================================================================================
    for segment in alignsegments:
        #Split each segment into lines (line=word with all the morphological and syntactic information)
        words = segment.splitlines()
        align_id    += 1
        sentence_id += 1
        for word in words:
            #read all the information about the word
            if word == '':
                #empty lines are sentence breaks
                sentence_id += 1
            else:
                columns = word.split('\t')
                if len(columns) < 6:
                    #If an empty segment encountered, insert a placeholder row
                    print('Note: an empty segment encountered at align_id {}'.format(align_id))
                    rowlist.append({'align_id'    : align_id,
                                    'sentence_id' : sentence_id,
                                    'text_id'     : text_id,
                                    'tokenid'     : 1,
                                    'token'       : 'EMPTYSEGMENT',
                                    'lemma'       : 'EMPTYSEGMENT',
                                    'pos'         : 'EMPTYSEGMENT',
                                    'feat'        : 'EMPTYSEGMENT',
                                    'head'        : 0,
                                    'deprel'      : 'EMPTY'})
                else:
                    #If this is a word with information, initialize a new row.
                    # NOTE(review): the two parsers apparently emit different
                    # column layouts (ru: feat/head/deprel at 5/6/7, fi: at
                    # 6/8/10) -- confirm against the parser export formats.
                    if tablename == 'ru_conll':
                        rowlist.append({'align_id'    : align_id,
                                        'sentence_id' : sentence_id,
                                        'text_id'     : text_id,
                                        'tokenid'     : columns[0],
                                        'token'       : columns[1],
                                        'lemma'       : columns[2],
                                        'pos'         : columns[4],
                                        'feat'        : columns[5],
                                        'head'        : columns[6],
                                        'deprel'      : columns[7]})

                    elif tablename == 'fi_conll':
                        rowlist.append({'align_id'    : align_id,
                                        'sentence_id' : sentence_id,
                                        'text_id'     : text_id,
                                        'tokenid'     : columns[0],
                                        'token'       : columns[1],
                                        'lemma'       : columns[2],
                                        'pos'         : columns[4],
                                        'feat'        : columns[6],
                                        'head'        : columns[8],
                                        'deprel'      : columns[10]})
        bar.next()
    #================================================================================

    bar.finish()
    print('\nInserting to database, this might take a while...')
    con.BatchInsert(tablename,rowlist)
    print('Done. Inserted {} rows.'.format(con.cur.rowcount))
Example no. 8
0
from dbmodule import psycopg

# Connect to the Finnish Araneum corpus database.
con = psycopg("araneum_fi", 'juho')

# Every sentence above the cutoff id is tagged as belonging to group "lc1b".
sentence_rows = con.FetchQuery(
    "SELECT DISTINCT sentence_id FROM fi_conll WHERE sentence_id > 1199689")
for sentence_row in sentence_rows:
    con.query(
        "INSERT INTO groups (name, sentence_id, corpus) values(%s, %s, %s)",
        ("lc1b", sentence_row[0], "araneum"),
        commit=True)
Example no. 9
0
def main():
    """Insert a target-language (translation) conll file whose source text
    is already in the database.

    Splits the translation file into aligned segments, maps them one-to-one
    onto the align_ids of the previously inserted source text, registers the
    translation in translation_ids, and batch-inserts one row per token.

    Raises:
        MissingTextError: if the given text id is not in the text_ids table.
        AlignMismatch: if the segment counts of source and translation differ.

    Bugfix: the argv parsing used a bare ``except:``, which hid any
    unrelated error (even KeyboardInterrupt) behind the usage message;
    it now catches only IndexError, like the sibling scripts.
    """
    #Get command line input:
    try:
        conllinputfile = sys.argv[1]
        text_id = sys.argv[2]
        dbname = sys.argv[3]
        sl_dbtablename = sys.argv[4]
        tl_dbtablename = sys.argv[5]
    except IndexError:
        print('''Usage: {} 
        <path to target language conll formatted text>
        <text id of the inserted source language text>
        <database name>
        <source language database table name>
        <target language database table name>
        '''.format(sys.argv[0]))
        sys.exit(0)

    #Connect to the database
    con = psycopg(dbname, 'juho')
    #read the conll data
    with open(conllinputfile, 'r') as f:
        conllinput = f.read()

    #fetch the id of the pair that is already inserted
    text_id = con.FetchQuery(
        "SELECT id FROM {} WHERE id = %s".format('text_ids'), (text_id, ))
    try:
        text_id = text_id[0][0]
    except IndexError:
        raise MissingTextError('No such id in the text_ids table')

    #Get all the align ids that were inserted with the first file
    align_ids = con.FetchQuery(
        "SELECT DISTINCT align_id FROM {} WHERE text_id = %s order by align_id"
        .format(sl_dbtablename), (text_id, ))

    # Split the translation file into aligned segments according to the !!!! -notation
    splitpattern = re.compile(
        r"\d+\t![^\n]+\n\n?\d+\t![^\n]+\n\n?\d+\t![^\n]+\n\n?\d+\t![^\n]+\n\n")
    alignsegments = re.split(splitpattern, conllinput)
    #Filter out empty align segments
    alignsegments = TrimList(alignsegments)

    #Test that same number of segments
    if len(alignsegments) != len(align_ids):
        raise AlignMismatch(
            'The number of segments differs from the number in the source text: {}/{}'
            .format(len(alignsegments), len(align_ids)))

    #Get the current maximum indices:
    sentence_id = GetLastValue(
        con.FetchQuery(
            "SELECT max(sentence_id) FROM {}".format(tl_dbtablename)))
    #Insert a new entry in the translation_ids table
    translator = input('Give the author for this translation:\n')
    con.query(
        "INSERT INTO translation_ids (translator, sourcetext_id) VALUES(%s, %s)",
        (
            translator,
            text_id,
        ),
        commit=True)
    # max(id) is assumed to be the id of the row inserted just above
    translation_id = GetLastValue(
        con.FetchQuery(
            "SELECT max(id) FROM translation_ids WHERE sourcetext_id = %(sid)s",
            {'sid': text_id}))

    #Initialize variales for db insertion
    rowlist = list()
    bar = Bar('Preparing the data for insertion into the database',
              max=len(alignsegments))

    #================================================================================
    for idx, align_id in enumerate(align_ids):
        align_id = align_id[0]
        segment = alignsegments[idx]
        #Split each segment into lines (line=word with all the morphological and syntactic information)
        words = segment.splitlines()
        sentence_id += 1
        for word in words:
            #read all the information about the word
            if word == '':
                #empty lines are sentence breaks
                sentence_id += 1
            else:
                columns = word.split('\t')
                if len(columns) < 7:
                    #If an empty segment encountered, insert a placeholder row
                    print('Note: an empty segment encountered at align_id {}'.
                          format(align_id))
                    rowlist.append({
                        'align_id': align_id,
                        'sentence_id': sentence_id,
                        'text_id': text_id,
                        'translation_id': translation_id,
                        'tokenid': 1,
                        'token': 'EMPTYSEGMENT',
                        'lemma': 'EMPTYSEGMENT',
                        'pos': 'EMPTYSEGMENT',
                        'feat': 'EMPTYSEGMENT',
                        'head': 0,
                        'deprel': 'EMPTY'
                    })
                else:
                    #If this is a word with information, initialize a new row.
                    #The column layout is keyed on the SOURCE table because the
                    #translation is in the opposite language's parse format.
                    if sl_dbtablename == 'fi_conll':
                        rowlist.append({
                            'align_id': align_id,
                            'sentence_id': sentence_id,
                            'text_id': text_id,
                            'translation_id': translation_id,
                            'tokenid': columns[0],
                            'token': columns[1],
                            'lemma': columns[2],
                            'pos': columns[4],
                            'feat': columns[5],
                            'head': columns[6],
                            'deprel': columns[7]
                        })

                    elif sl_dbtablename == 'ru_conll':
                        rowlist.append({
                            'align_id': align_id,
                            'sentence_id': sentence_id,
                            'text_id': text_id,
                            'translation_id': translation_id,
                            'tokenid': columns[0],
                            'token': columns[1],
                            'lemma': columns[2],
                            'pos': columns[4],
                            'feat': columns[6],
                            'head': columns[8],
                            'deprel': columns[10]
                        })
        bar.next()
    #================================================================================

    bar.finish()
    print('\nInserting to database, this might take a while...')
    con.BatchInsert(tl_dbtablename, rowlist)
    print('Done. Inserted {} rows.'.format(con.cur.rowcount))
Example no. 10
0
    # Commit:
    con.connection.commit()

if __name__ == "__main__":
    #Initialize a logger and start the function that creates the contrastive layer

    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s: %(message)s')

    # Log everything at DEBUG level both to a file and to stdout,
    # with the same timestamped format.
    fh = logging.FileHandler('logof_contrastivelayer.txt')
    ch = logging.StreamHandler(sys.stdout)
    for handler in (fh, ch):
        handler.setLevel(logging.DEBUG)
        handler.setFormatter(formatter)
        root.addHandler(handler)

    #Connect:
    prcon = psycopg('syntparrus','juho')
    pfcon = psycopg('syntparfin','juho')
    logging.info('\n{0} \nSTART CREATING THE CONTRASTIVE LAYER \n{0} \n'.format('*'*60))
    logging.info('\n{0} \n The ParFin database \n{0} \n'.format('-'*60))
    createContrastiveLayer(pfcon)