Example #1
0
def buildIndex():
    """Create the title indexes on every per-language wiki table.

    For each language code in ``en``, ``es``, ``ru`` and ``fa`` two indexes
    are created on table ``wiki_<lang>``: ``<lang>IndexTitle`` on ``title``
    and ``<lang>IndexLowerTitle`` on ``lower(title)``. Every statement is
    printed, executed and committed on its own.

    On ``psycopg2.DatabaseError`` the error is printed and the process
    exits with status -1. The connection is closed in every case.
    """
    CONN_STRING = global_setting.get_CONN()
    con = None
    try:
        con = psycopg2.connect(CONN_STRING)
        con.set_client_encoding('UTF8')
        cur = con.cursor()

        # The language codes are fixed literals, so building the DDL by
        # concatenation is safe here.
        langs = ['en', 'es', 'ru', 'fa']
        for lang in langs:
            table = 'wiki_' + lang
            statements = [
                'CREATE INDEX ' + lang + 'IndexTitle ON ' + table + '(title)',
                'CREATE INDEX ' + lang + 'IndexLowerTitle ON ' + table +
                '(lower(title))',
            ]
            for statement in statements:
                print(statement)
                cur.execute(statement)
                # Commit per statement so earlier indexes survive a later
                # failure.
                con.commit()

    except psycopg2.DatabaseError as e:
        print('Error %s' % e)
        sys.exit(-1)
    finally:
        # Runs on success and on sys.exit() alike, so the connection is
        # never leaked.
        if con is not None:
            con.close()
Example #2
0
def create_table(table_name):
    """Connect to the database and make sure ``table_name`` exists.

    The table is looked up in ``information_schema.tables``; when no row is
    found it is created with the columns ``id``, ``lang``, ``title``,
    ``wiki_url``, ``abstract`` and ``parse_result``.

    Args:
        table_name: name of the table to look up and, if missing, create.

    Returns:
        The open psycopg2 connection. It is deliberately left open; the
        caller is responsible for closing it.

    On ``psycopg2.DatabaseError`` the error is printed, the connection (if
    one was opened) is closed, and the process exits with status -1.
    """
    # The connection string comes from global_setting; adjust it there.
    CONN_STRING = global_setting.get_CONN()
    con = None

    try:
        con = psycopg2.connect(CONN_STRING)
        con.set_client_encoding('UTF8')
        cur = con.cursor()

        # Check whether the table exists. The name is bound as a query
        # parameter instead of being concatenated into the SQL text.
        cur.execute(
            "select * from information_schema.tables where table_name=%s",
            (table_name,))
        rows = cur.fetchall()

        if not rows:  # table doesn't exist
            # NOTE(review): an identifier cannot be bound as a parameter, so
            # table_name is still concatenated here; callers must only pass
            # trusted table names.
            query = 'create table ' + table_name + '(id char(12) PRIMARY KEY,lang char(2),title text,wiki_url text, abstract text, parse_result text)'
            cur.execute(query)
            con.commit()

        return con
    except psycopg2.DatabaseError as e:
        print('Error %s' % e)
        # Do not leak the connection when we are about to exit.
        if con is not None:
            con.close()
        sys.exit(-1)
Example #3
0
def buildIndex():
    """Build the ``title`` and ``lower(title)`` indexes for each wiki table.

    Iterates over the language codes ``en``, ``es``, ``ru`` and ``fa`` and,
    for table ``wiki_<lang>``, creates ``<lang>IndexTitle`` on ``title`` and
    ``<lang>IndexLowerTitle`` on ``lower(title)``. Each statement is printed
    before execution and committed separately.

    A ``psycopg2.DatabaseError`` is printed and terminates the process with
    exit status -1. The connection is closed whether or not an error occurs.
    """
    CONN_STRING = global_setting.get_CONN()
    con = None
    try:
        con = psycopg2.connect(CONN_STRING)
        con.set_client_encoding('UTF8')
        cur = con.cursor()

        # (index-name suffix, indexed expression) for the two indexes built
        # on every table.
        index_specs = [
            ('IndexTitle', '(title)'),
            ('IndexLowerTitle', '(lower(title))'),
        ]

        # Language codes are fixed literals, so concatenating them into the
        # DDL is safe.
        langs = ['en', 'es', 'ru', 'fa']
        for lang in langs:
            for suffix, expression in index_specs:
                query = ('CREATE INDEX ' + lang + suffix + ' ON wiki_' +
                         lang + expression)
                print(query)
                cur.execute(query)
                # Commit each index on its own so completed work is kept if
                # a later statement fails.
                con.commit()

    except psycopg2.DatabaseError as e:
        print('Error %s' % e)
        sys.exit(-1)
    finally:
        # Also executed when sys.exit() raises SystemExit above.
        if con is not None:
            con.close()
Example #4
0
def create_table(table_name):
    """Open a database connection and create ``table_name`` if it is missing.

    Existence is determined by querying ``information_schema.tables``. When
    the query returns no rows, the table is created with the columns ``id``
    (primary key), ``lang``, ``title``, ``wiki_url``, ``abstract`` and
    ``parse_result``, and the creation is committed.

    Args:
        table_name: name of the table to check for and possibly create.

    Returns:
        The open psycopg2 connection; closing it is left to the caller.

    On ``psycopg2.DatabaseError`` the error is printed, any opened
    connection is closed, and the process exits with status -1.
    """
    # The connection string comes from global_setting; adjust it there.
    CONN_STRING = global_setting.get_CONN()
    con = None

    try:
        con = psycopg2.connect(CONN_STRING)
        con.set_client_encoding('UTF8')
        cur = con.cursor()

        # Check whether the table exists; table_name is passed as a bound
        # parameter so it can never alter the SQL statement.
        query = "select * from information_schema.tables where table_name=%s"
        cur.execute(query, (table_name,))
        rows = cur.fetchall()

        if not rows:  # table doesn't exist
            # NOTE(review): identifiers cannot be sent as bound parameters,
            # so the name is concatenated below; only trusted names should
            # reach this function.
            query = 'create table ' + table_name + '(id char(12) PRIMARY KEY,lang char(2),title text,wiki_url text, abstract text, parse_result text)'
            cur.execute(query)
            con.commit()

        return con
    except psycopg2.DatabaseError as e:
        print('Error %s' % e)
        # The success path hands the connection to the caller; on failure
        # nobody else will close it.
        if con is not None:
            con.close()
        sys.exit(-1)
def main():
    """Command-line entry point: look up wiki abstracts by title.

    Options:
        -i/--input             search string (required)
        -l/--lang              language, one of EN|ES|RU|FA (required;
                               matched case-insensitively)
        -s/--substring         match the input as a substring
        -c/--casesensitive     match case-sensitively
        -p/--preferredmeaning  first run get_paragraph_prefer.prefer_search
        -d/--debug             print the SQL query before running it
        --stdout               print only the first abstract

    With ``-p`` a ``setting`` object is filled in and handed to
    ``get_paragraph_prefer.prefer_search``; when that returns 0 this function
    returns 0 without querying the title table. Otherwise the per-language
    ``wiki_<lang>`` table is queried on ``title`` and the results printed.

    On ``psycopg2.DatabaseError`` the error is printed and the process
    exits with status 1. The connection is always closed.
    """
    from optparse import OptionParser

    # Database settings
    CONN_STRING = global_setting.get_CONN()
    con = None

    # Option
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-i",
                      "--input",
                      dest="inword",
                      help="input string (example: \"Barack Obama\")")
    parser.add_option("-l",
                      "--lang",
                      dest="lang",
                      help="language (one of EN|RU|ES|FA)")
    parser.add_option(
        "-s",
        "--substring",
        dest="substring",
        action="store_true",
        help="match input string as substring (default is exact match)",
        default=False)
    parser.add_option(
        "-c",
        "--casesensitive",
        dest="case_sensitive",
        action="store_true",
        help=
        "match input string as case-sensitive (default is case-insensitive)",
        default=False)
    parser.add_option(
        "-p",
        "--preferredmeaning",
        dest="preferred_meaning",
        action="store_true",
        help="return preferred meaning of category (default is NOT preferred)",
        default=False)
    parser.add_option("-d",
                      "--debug",
                      dest="debug",
                      action="store_true",
                      help="output debug info, default is false",
                      default=False)
    parser.add_option("--stdout",
                      dest='stdout',
                      action='store_true',
                      help='direct write the output to stdout',
                      default=False)
    (options, args) = parser.parse_args()

    if not options.inword:
        parser.error(
            "Must supply input string. (Example: -i \"Barack Obama\")")
    if not options.lang:
        parser.error(
            "Must supply language. (Example: -l EN; allowed languages: EN|ES|RU|FA)"
        )

    inword = options.inword
    # Normalise the case so that "-l en" works as well as "-l EN".
    lang = options.lang.upper()
    substring = options.substring
    case_sensitive = options.case_sensitive
    preferred_meaning = options.preferred_meaning
    debug = options.debug
    stdout = options.stdout

    # Per-language suffixes and table names, looked up by position.
    langIndex = {"EN": 0, "ES": 1, "RU": 2, "FA": 3}
    qlangs = ['@eng', '@spa', '@rus', '@fas']
    wlangs = ['@en', '@es', '@ru', '@fa']
    tables = ['wiki_en', 'wiki_es', 'wiki_ru', 'wiki_fa']
    if lang not in langIndex:
        # Report a usage error instead of crashing with a KeyError.
        parser.error(
            "Unsupported language %r. (allowed languages: EN|ES|RU|FA)" %
            options.lang)
    lindex = langIndex[lang]
    qlang = qlangs[lindex]
    wlang = wlangs[lindex]
    table_name = tables[lindex]

    if preferred_meaning:
        myset = setting()
        myset.inword = inword
        myset.substring = substring
        myset.case_sensitive = case_sensitive
        myset.preferred_meaning = preferred_meaning
        myset.qlang = qlang
        myset.wlang = wlang
        myset.table_name = table_name
        myset.lang = lang
        myset.CONN_STRING = CONN_STRING
        myset.debug = debug
        goon = get_paragraph_prefer.prefer_search(myset)
        if goon == 0:
            return 0

    # Prepare search word: spaces become underscores, and a substring search
    # wraps the word in LIKE wildcards.
    inword = inword.replace(" ", "_")
    if substring:
        inword = '%' + inword + '%'

    # Pick the comparison operator: case-insensitive searches always use
    # ILIKE; case-sensitive ones use LIKE for substrings and "=" for an
    # exact match.
    if not case_sensitive:
        operator = 'ilike'
    elif substring:
        operator = 'like'
    else:
        operator = '='

    # table_name and operator come from the fixed lists above; the
    # user-supplied word is passed as a bound parameter so quotes in it can
    # neither break nor alter the statement.
    query = ('select title,abstract from ' + table_name +
             ' where title ' + operator + ' %s')
    params = (inword,)
    if debug:
        print('Query: %s params: %r' % (query, params))

    try:
        # Change CONN_STRING accordingly.
        con = psycopg2.connect(CONN_STRING)
        con.set_client_encoding('UTF8')
        cur = con.cursor(cursor_factory=psycopg2.extras.DictCursor)
        cur.execute(query, params)
        # Get result
        rows = cur.fetchall()
        if not stdout:
            i = 0
            for row in rows:
                i += 1
                print('#' + str(i) + " TITLE: " + row['title'])
                print('#' + str(i) + " ABSTRACT: " + row['abstract'])
        else:
            if len(rows) > 0:
                print(rows[0]['abstract'])
            else:
                print('no result!')
    except psycopg2.DatabaseError as e:
        print('Error %s' % e)
        sys.exit(1)
    finally:
        # Runs on normal completion and on sys.exit() alike.
        if con is not None:
            con.close()
def main():
    """Command-line entry point: rebuild the ``multilink`` table.

    Reads ``interlanguage_links_same_as_en.nt`` from the directory given by
    ``-d/--dir``, drops and recreates the ``multiLink`` table, inserts one
    ``(engTitle, lang, otherTitle)`` record for every line whose third
    space-separated field points at the ES, FA or RU DBpedia, and finally
    creates an index on ``engTitle``.

    On ``psycopg2.DatabaseError`` the error is printed and the process
    exits with status -1. The input file and the connection are always
    closed.
    """
    from optparse import OptionParser

    # option
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-d",
                      "--dir",
                      dest="dirPath",
                      help="the *.nt files dir path")
    (options, args) = parser.parse_args()

    if not options.dirPath:
        parser.error("Please input the dir path")

    dirPath = options.dirPath
    filePath = dirPath + '/' + 'interlanguage_links_same_as_en.nt'
    CONN_STRING = global_setting.get_CONN()

    # Resource prefix of each supported language edition -> language code.
    lang_prefixes = (
        ('<http://es.dbpedia.org/resource', 'ES'),
        ('<http://fa.dbpedia.org/resource', 'FA'),
        ('<http://ru.dbpedia.org/resource', 'RU'),
    )

    con = None
    # create table
    try:
        con = psycopg2.connect(CONN_STRING)
        con.set_client_encoding('UTF8')
        cur = con.cursor()

        query = 'DROP TABLE IF EXISTS multilink;'
        cur.execute(query)
        con.commit()

        print(query)
        query = 'CREATE TABLE multiLink(engTitle varchar,lang varchar, otherTitle varchar);'
        cur.execute(query)
        con.commit()

        print(query)
        # process files
        records = []
        i = 0
        with open(filePath, 'r') as nt_file:
            # The first line of the file is skipped.
            next(nt_file, None)
            for line in nt_file:
                line = line.decode('raw_unicode_escape')
                ll = line.split(' ')
                if len(ll) < 3:
                    # Blank or malformed line: there is no third field.
                    continue
                lang = ''
                for prefix, code in lang_prefixes:
                    if ll[2].startswith(prefix):
                        lang = code
                if lang == '':
                    continue
                i += 1
                engTitle = extractTitle(ll[0])
                otherTitle = extractTitle(ll[2])
                record = (engTitle, lang, otherTitle)
                records.append(record)
                # Insert in batches to keep memory use bounded.
                if len(records) > 100:
                    insert_records(con, records)
                    records = []
                if i % 10000 == 0:
                    print('%d records inserted!' % i)
        # Flush whatever is left of the last, partial batch.
        if len(records) > 0:
            insert_records(con, records)
            records = []
        print(i)

        con.commit()
        # buildIndex
        query = 'CREATE INDEX multilink_engTitle on multilink(engTitle);'
        cur.execute(query)
        con.commit()

    except psycopg2.DatabaseError as e:
        print('Error %s' % e)
        sys.exit(-1)
    finally:
        # Runs on normal completion and on sys.exit() alike.
        if con is not None:
            con.close()
def main():
    """Command-line entry point: search a wiki table by title.

    Options:
        -i/--input             search string (required)
        -l/--lang              language, one of EN|ES|RU|FA (required;
                               matched case-insensitively)
        -s/--substring         match the input as a substring
        -c/--casesensitive     match case-sensitively
        -p/--preferredmeaning  first run get_paragraph_prefer.prefer_search
        -d/--debug             print the SQL query before running it
        --stdout               print only the first abstract

    When ``-p`` is given, a ``setting`` object carrying the parsed options is
    passed to ``get_paragraph_prefer.prefer_search``; if that returns 0 this
    function returns 0 and no title query is run. Otherwise the
    ``wiki_<lang>`` table is queried on ``title`` and the matches printed.

    On ``psycopg2.DatabaseError`` the error is printed and the process
    exits with status 1. The connection is always closed.
    """
    from optparse import OptionParser

    # Database settings
    CONN_STRING = global_setting.get_CONN()
    con = None

    # Option
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-i", "--input", dest="inword",
                      help="input string (example: \"Barack Obama\")")
    parser.add_option("-l", "--lang", dest="lang",
                      help="language (one of EN|RU|ES|FA)")
    parser.add_option("-s", "--substring", dest="substring",
                      action="store_true",
                      help="match input string as substring (default is exact match)",
                      default=False)
    parser.add_option("-c", "--casesensitive", dest="case_sensitive",
                      action="store_true",
                      help="match input string as case-sensitive (default is case-insensitive)",
                      default=False)
    parser.add_option("-p", "--preferredmeaning", dest="preferred_meaning",
                      action="store_true",
                      help="return preferred meaning of category (default is NOT preferred)",
                      default=False)
    parser.add_option("-d", "--debug", dest="debug", action="store_true",
                      help="output debug info, default is false",
                      default=False)
    parser.add_option("--stdout", dest='stdout', action='store_true',
                      help='direct write the output to stdout',
                      default=False)
    (options, args) = parser.parse_args()

    if not options.inword:
        parser.error("Must supply input string. (Example: -i \"Barack Obama\")")
    if not options.lang:
        parser.error("Must supply language. (Example: -l EN; allowed languages: EN|ES|RU|FA)")

    inword = options.inword
    # Upper-case the code so "-l en" is accepted like "-l EN".
    lang = options.lang.upper()
    substring = options.substring
    case_sensitive = options.case_sensitive
    preferred_meaning = options.preferred_meaning
    debug = options.debug
    stdout = options.stdout

    # Per-language suffixes and table names, looked up by position.
    langIndex = {"EN": 0, "ES": 1, "RU": 2, "FA": 3}
    qlangs = ['@eng', '@spa', '@rus', '@fas']
    wlangs = ['@en', '@es', '@ru', '@fa']
    tables = ['wiki_en', 'wiki_es', 'wiki_ru', 'wiki_fa']
    if lang not in langIndex:
        # Fail with a usage message rather than an unhandled KeyError.
        parser.error("Unsupported language %r. (allowed languages: EN|ES|RU|FA)"
                     % options.lang)
    lindex = langIndex[lang]
    qlang = qlangs[lindex]
    wlang = wlangs[lindex]
    table_name = tables[lindex]

    if preferred_meaning:
        myset = setting()
        myset.inword = inword
        myset.substring = substring
        myset.case_sensitive = case_sensitive
        myset.preferred_meaning = preferred_meaning
        myset.qlang = qlang
        myset.wlang = wlang
        myset.table_name = table_name
        myset.lang = lang
        myset.CONN_STRING = CONN_STRING
        myset.debug = debug
        goon = get_paragraph_prefer.prefer_search(myset)
        if goon == 0:
            return 0

    # Prepare search word: spaces become underscores; a substring search
    # additionally wraps the word in LIKE wildcards.
    inword = inword.replace(" ", "_")
    if substring:
        inword = '%' + inword + '%'

    # Build query. Operator by mode:
    #   case-insensitive            -> ilike
    #   case-sensitive + substring  -> like
    #   case-sensitive + exact      -> =
    if case_sensitive:
        operator = 'like' if substring else '='
    else:
        operator = 'ilike'

    # Only fixed, internally chosen text (table name, operator) is
    # concatenated; the user's word travels as a bound parameter.
    query = "select title,abstract from %s where title %s %%s" % (
        table_name, operator)
    params = (inword,)
    if debug:
        print('Query: %s params: %r' % (query, params))

    try:
        # Change CONN_STRING accordingly.
        con = psycopg2.connect(CONN_STRING)
        con.set_client_encoding('UTF8')
        cur = con.cursor(cursor_factory=psycopg2.extras.DictCursor)
        cur.execute(query, params)
        # Get result
        rows = cur.fetchall()
        if not stdout:
            i = 0
            for row in rows:
                i += 1
                print('#' + str(i) + " TITLE: " + row['title'])
                print('#' + str(i) + " ABSTRACT: " + row['abstract'])
        else:
            if len(rows) > 0:
                print(rows[0]['abstract'])
            else:
                print('no result!')
    except psycopg2.DatabaseError as e:
        print('Error %s' % e)
        sys.exit(1)
    finally:
        # Executed on success and when sys.exit() raises SystemExit.
        if con is not None:
            con.close()
def main():
    """Command-line entry point: load interlanguage links into ``multilink``.

    Takes the directory passed with ``-d/--dir``, reads
    ``interlanguage_links_same_as_en.nt`` from it, drops and recreates the
    ``multiLink`` table, and inserts an ``(engTitle, lang, otherTitle)``
    record for each line whose third space-separated field starts with the
    ES, FA or RU DBpedia resource prefix. An index on ``engTitle`` is built
    at the end.

    On ``psycopg2.DatabaseError`` the error is printed and the process
    exits with status -1. The input file and the connection are always
    closed.
    """
    from optparse import OptionParser

    # option
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-d", "--dir", dest="dirPath", help="the *.nt files dir path")
    (options, args) = parser.parse_args()

    if not options.dirPath:
        parser.error("Please input the dir path")

    dirPath = options.dirPath
    filePath = dirPath + '/' + 'interlanguage_links_same_as_en.nt'
    CONN_STRING = global_setting.get_CONN()

    con = None
    # create table
    try:
        con = psycopg2.connect(CONN_STRING)
        con.set_client_encoding('UTF8')
        cur = con.cursor()

        query = 'DROP TABLE IF EXISTS multilink;'
        cur.execute(query)
        con.commit()

        print(query)
        query = 'CREATE TABLE multiLink(engTitle varchar,lang varchar, otherTitle varchar);'
        cur.execute(query)
        con.commit()

        print(query)
        # process files
        records = []
        i = 0
        with open(filePath, 'r') as nt_file:
            # Discard the first line of the file.
            next(nt_file, None)
            for line in nt_file:
                line = line.decode('raw_unicode_escape')
                ll = line.split(' ')
                if len(ll) < 3:
                    # No third field on this line (blank or malformed).
                    continue
                target = ll[2]
                if target.startswith('<http://es.dbpedia.org/resource'):
                    lang = 'ES'
                elif target.startswith('<http://fa.dbpedia.org/resource'):
                    lang = 'FA'
                elif target.startswith('<http://ru.dbpedia.org/resource'):
                    lang = 'RU'
                else:
                    # Link to a language edition we do not import.
                    continue
                i += 1
                engTitle = extractTitle(ll[0])
                otherTitle = extractTitle(target)
                records.append((engTitle, lang, otherTitle))
                # Write in batches so the pending list stays small.
                if len(records) > 100:
                    insert_records(con, records)
                    records = []
                if i % 10000 == 0:
                    print('%d records inserted!' % i)
        # Insert the remainder of the last batch.
        if len(records) > 0:
            insert_records(con, records)
            records = []
        print(i)

        con.commit()
        # buildIndex
        query = 'CREATE INDEX multilink_engTitle on multilink(engTitle);'
        cur.execute(query)
        con.commit()

    except psycopg2.DatabaseError as e:
        print('Error %s' % e)
        sys.exit(-1)
    finally:
        # Executed on success and when sys.exit() raises SystemExit.
        if con is not None:
            con.close()