def load_contexts():
    db_csx = MyMySQL(db="csx", user="******", passwd="")
    db_cg = MyMySQL(db="csx_citegraph", user="******", passwd="")

    pubs = get_pubs(db_csx)
    print "%d publications loaded." % len(pubs)

    # Map publication ids to their clusters for fast lookup
    clusters = {str(pub_id): cluster for pub_id, cluster in pubs}

    citations = get_citations(db_csx)
    print "%d citations loaded." % len(citations)

    found = 0
    for n, (citing, cited) in enumerate(citations):
        cciting = clusters[str(citing)]
        ccited = clusters[str(cited)]

        context = get_context(db_cg, cciting, ccited)
        if context is None:
            context = ''
        else:
            # Escape single quotes so the context can be safely quoted in SQL
            context = context.replace("'", '"')
            found += 1

        try:
            update_graph(db_csx, citing, cited, context)
        except Exception:
            print "Exception when updating 'graph' table."

    print "%d out of %d contexts found." % (found, n + 1)
def __init__(self, ec2_manager, ec2_instance_id, ec2_instance_dns):
    '''
    Since this is run on the main process, it shouldn't open
    connections or file descriptors.
    '''
    # Zeno task manager
    self.tasks = zeno.TasksManager("tasks",
                                   host=config.DB_HOST,
                                   user=config.DB_USER,
                                   passwd=config.DB_PASSWD)

    # Database connection
    self.db = MyMySQL(db=config.DB_NAME,
                      host=config.DB_HOST,
                      user=config.DB_USER,
                      passwd=config.DB_PASSWD)

    # EC2 manager to issue commands
    self.ec2_manager = ec2_manager

    # EC2 instance information to be used as a proxy
    self.ec2_instance_id = ec2_instance_id
    self.ec2_instance_dns = ec2_instance_dns

    # Logging configuration
    self.log = utils.config_logging('downloader', stream=sys.stdout, level=logging.DEBUG,
                                    format='%(asctime)s (%(name)s) [%(levelname)6s]: %(message)s',
                                    datefmt="%Y-%m-%d %H:%M:%S")
def write_surveys_queries_file(prefix, npubs=110):
    db = MyMySQL(db=config.DB_NAME)
    candidates = db.select_query('''SELECT id, substring(title,1,140), year FROM papers
                                    WHERE title LIKE '%survey%'
                                      AND (year IS NOT NULL)
                                      AND (year BETWEEN 1950 AND 2014)''')
    print "Candidates: %d" % len(candidates)

    # Include the word 'survey' for this particular case
    _stop_words_.add("survey")

    # Write candidates to file, keeping only publications cited at least 20 times
    file = open(prefix + ".txt", "w")
    n = 0
    for pub_id, title, year in candidates:
        citations = utils.get_cited(db, pub_id)
        if len(citations) >= 20:
            query = to_query(title)
            print >> file, "%s\t%d\t%s\t%s" % (pub_id, year, title.strip(), query)

            n += 1
            if n >= npubs:
                break

    file.close()
def manual_queries_topic_graphs(from_dataset, to_dataset):
    db = MyMySQL(db=to_dataset)
    pub_ids = set(db.select("id", table="papers"))

    from_folder = config.DATA + "query_sets/" + from_dataset + "/manual/"
    to_folder = config.DATA + "query_sets/" + to_dataset + "/manual/"
    for file_name in os.listdir(from_folder):
        print file_name

        from_file = open(from_folder + file_name, 'r')
        to_file = open(to_folder + file_name, 'w')

        # Read the header line and write it back unchanged
        header = from_file.readline().strip('\n')
        print >> to_file, header

        for line in from_file:
            relev, pub_id, title = line.strip().split('\t')

            # Blank out ids that don't exist in the target dataset
            if pub_id not in pub_ids:
                pub_id = ''

            print >> to_file, "%s\t%s\t%s" % (relev, pub_id, title)

        from_file.close()
        to_file.close()
def get_stats(dataset):
    db = MyMySQL(db=dataset)
    kw_table = 'doc_ngrams' if (dataset == 'aminer') else 'doc_kws'

    # Node counts
    npubs = db.select_query("select count(*) from papers")[0][0]
    nauthors = db.select_query("select count(distinct author_id) from authorships")[0][0]
    nkws = db.select_query("select count(distinct ngram) from %s" % kw_table)[0][0]
    nvenues = db.select_query("select count(distinct venue_id) from papers")[0][0]

    # Edge counts
    pubs_pubs = db.select_query("select count(*) from graph")[0][0]
    auths_auths = db.select_query("select count(*) from coauthorships")[0][0]
    pubs_authors = db.select_query("select count(*) from authorships")[0][0]
    pubs_kws = db.select_query("select count(*) from %s where value>=%f"
                               % (kw_table, config.MIN_NGRAM_TFIDF))[0][0]

    # Print the counts as rows of a LaTeX tabular
    print "\\hline"
    print "\\multicolumn{4}{|c|}{%s} \\\\" % TEX_NAMES[dataset]
    print "\\hline"
    print "pubs ($N_p$) & %d & pubs-pubs & %d \\\\" % (npubs, pubs_pubs)
    print "authors & %d & authors-authors & %d \\\\" % (nauthors, auths_auths)
    print "keywords ($N_k$) & %d & pubs-keywords & %d \\\\" % (nkws, pubs_kws)
    print "venues ($N_v$) & %d & pubs-authors & %d \\\\" % (nvenues, pubs_authors)
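# For illustration only (the dataset label is hypothetical): with the debug
# values that used to be hardcoded above (npubs=1, nauthors=2, nkws=3, nvenues=4,
# pubs_pubs=1, auths_auths=4, pubs_authors=2, pubs_kws=3) and
# TEX_NAMES[dataset] == 'CiteSeerX', get_stats would print:
#
#   \hline
#   \multicolumn{4}{|c|}{CiteSeerX} \\
#   \hline
#   pubs ($N_p$) & 1 & pubs-pubs & 1 \\
#   authors & 2 & authors-authors & 4 \\
#   keywords ($N_k$) & 3 & pubs-keywords & 3 \\
#   venues ($N_v$) & 4 & pubs-authors & 2 \\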
def __init__(self):
    # Zeno task manager
    self.tasks = zeno.TasksManager("tasks",
                                   host=config.DB_HOST,
                                   user=config.DB_USER,
                                   passwd=config.DB_PASSWD)

    # Database connection
    self.db = MyMySQL(db=config.DB_NAME,
                      host=config.DB_HOST,
                      user=config.DB_USER,
                      passwd=config.DB_PASSWD)

    # Logging configuration
    self.log = utils.config_logging('tokenizer', stream=sys.stdout, level=logging.DEBUG,
                                    format='%(asctime)s (%(name)s) [%(levelname)6s]: %(message)s',
                                    datefmt="%Y-%m-%d %H:%M:%S")

    self.MIN_TOKENS = 10

    # Create the output folders if they don't exist yet
    utils.ensure_folder(os.path.dirname(config.TOKENS_PATH))
    utils.ensure_folder(os.path.dirname(config.TOKENS_PATH_PARTS))
def get_cited_papers(doc_id):
    db = MyMySQL(db=DB_NAME, user=DB_USER, passwd=DB_PASSWD)
    return db.select_query("""SELECT r.cited_paper_id, g.start, g.end
                              FROM citations c
                              JOIN citation_groups g ON c.group_id = g.id
                              JOIN refs r ON c.ref_id = r.id
                              WHERE c.paper_id='%s'
                                AND r.cited_paper_id IS NOT NULL""" % doc_id)
def __init__(self, n=None):
    db = MyMySQL(db=config.DB_NAME, user=config.DB_USER, passwd=config.DB_PASSWD)
    rows = db.select(fields=["id", "title", "abstract"], table="papers")
    if n:
        rows = random.sample(rows, n)

    # 'abstract' instead of 'abs' to avoid shadowing the built-in
    self.pubs = {str(id): (title, abstract) for id, title, abstract in rows}
def main(argv):
    query = None
    usr = None
    output_file = None
    pwd = None
    n = 20

    try:
        opts, _args_ = getopt.getopt(argv, "hq:o:n:u:p:")
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            usage()
            sys.exit()
        elif opt == "-q":
            query = arg
        elif opt == "-o":
            output_file = arg
        elif opt == "-n":
            n = int(arg)
        elif opt == "-u":
            usr = arg
        elif opt == "-p":
            pwd = arg
        else:
            print "Invalid option: %s" % opt

    # Check mandatory arguments
    if (not query or not usr or not pwd):
        usage()
        sys.exit(2)

    s = searchers.Searcher(**config.PARAMS)
    pub_ids = s.search(query, limit=n)

    if not output_file:
        output_file = utils.get_graph_file_name(query)

    # Write the graph structure as a GEXF file
    nx.write_gexf(s.graph, output_file)

    # Print the results
    db = MyMySQL(db='csx', user=usr, passwd=pwd)
    for id in pub_ids:
        print "%12s\t %s" % (id, db.select_one("title", table="papers", where="id='%s'" % id))
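# A typical invocation, assuming this module is the script's entry point
# (script name, query and credentials below are hypothetical):
#
#   python search_cli.py -q "topic modeling" -n 10 -u someuser -p somepass -o tm.gexf
#
# -q, -u and -p are mandatory; -n defaults to 20 results and -o defaults to a
# file name derived from the query by utils.get_graph_file_name().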
def write_surveys_queries(n=110):
    db = MyMySQL(db=config.DB_NAME)
    if not os.path.exists(config.QUERY_SETS_PATH):
        os.mkdir(config.QUERY_SETS_PATH)

    prefix = config.QUERY_SETS_PATH + "surveys"
    # write_surveys_queries_file(prefix, n)
    write_query_set_folder(db, prefix)
def __init__(self):
    self.index = Index(config.INDEX_PATH)

    # Get citation counts and store them in a dict for fast lookup
    db = MyMySQL(db=config.DB_NAME, user=config.DB_USER, passwd=config.DB_PASSWD)
    ncitations = db.select_query("SELECT cited, COUNT(*) from graph GROUP BY cited")
    self.ncitations = dict(ncitations)
def write_citations_queries(name1, n1, name2, n2):
    db = MyMySQL(db=config.DB_NAME)
    if not os.path.exists(config.QUERY_SETS_PATH):
        os.mkdir(config.QUERY_SETS_PATH)

    path1 = config.QUERY_SETS_PATH + name1
    path2 = config.QUERY_SETS_PATH + name2

    # write_citations_query_set_files(db, path1, n1, path2, n2)
    write_query_set_folder(db, path1)
    write_query_set_folder(db, path2)
def keyword_centric(keyword, from_db, to_db):
    db = MyMySQL(db=from_db)
    pub_ids = db.select("paper_id", table="keywords", where="kw='%s'" % keyword)

    nodes = set()
    new_nodes = set()
    new_nodes.update(pub_ids)

    # Expand one citation hop at a time until enough nodes are gathered
    n = 50000
    while len(nodes) < n:
        new_nodes = get_next_hop(new_nodes)
        nodes.update(new_nodes)
        print len(nodes)

    print "Adding %d nodes." % len(nodes)
    new_db = MyMySQL(db=to_db)
    # values = ','.join(['%s'%id for id in nodes])
    new_db.insert(into="use_papers", fields=["paper_id"], values=list(nodes))
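# get_next_hop() is defined elsewhere; a plausible sketch of its contract
# (an assumption for illustration, not the original implementation): given a
# set of paper ids, return every paper one citation hop away in the graph table.
#
#   def get_next_hop(pub_ids):
#       next_hop = set()
#       for pid in pub_ids:
#           next_hop.update(db.select("cited", table="graph", where="citing='%s'" % pid))
#           next_hop.update(db.select("citing", table="graph", where="cited='%s'" % pid))
#       return next_hop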
def check_ids(folder):
    db = MyMySQL(db='csx')
    for i in xrange(1, 8):
        print i
        print

        with open(folder + str(i) + ".txt") as file:
            _header_ = file.readline()
            for line in file:
                relev, pub_id, title = line.strip().split('\t')
                if len(db.select("id", table="papers", where="id='%s'" % pub_id)) == 0:
                    print "Pub not found:", pub_id
def find_ids_unsupervised(titles, index_folder):
    db = MyMySQL(db='csx')
    index = Index(index_folder)

    found = 0
    doc_ids = []
    for title in titles:
        top_docs, scores = index.search(title,
                                        search_fields=["title"],
                                        return_fields=["id"],
                                        return_scores=True,
                                        limit=5)
        # ids = index.get_documents(top_docs, fields="id")

        # To decide if the most similar title in the index is a hit, we check if its score
        # is significantly higher than those of the hits that follow it (second to fifth)
        if len(scores) > 2 and (scores[0] > 2 * np.mean(scores[1:])):
            doc_ids.append(top_docs[0][0])
            found += 1
        else:
            doc_ids.append("")

        # Only enable for debugging and finding a threshold
        if 0:
            print "-------"
            print "%s" % (title)
            print "-------"
            for i, (id, ) in enumerate(top_docs):
                title = db.select_one("title", table="papers", where="id='%s'" % id)
                print "%.2f\t%s" % (scores[i], title.encode("UTF-8"))

            if (scores[0] > 2 * np.mean(scores[1:])):
                print "Found!",
                op = '>'
            else:
                print "Not found!",
                op = '<'
            print "(%.2f %s %.2f)\n" % (scores[0], op, 2 * np.mean(scores[1:]))

    return doc_ids
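# For intuition on the acceptance heuristic above (scores are made up): the top
# hit is taken only when its score is more than twice the mean of the runner-ups.
#
#   scores = [9.1, 2.3, 2.0, 1.8, 1.7]   # 9.1 > 2 * 1.95  = 3.90 -> hit
#   scores = [3.2, 2.9, 2.6, 2.4, 2.2]   # 3.2 < 2 * 2.525 = 5.05 -> no hit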
def get_citing_papers(doc_id):
    db = MyMySQL(db=DB_NAME, user=DB_USER, passwd=DB_PASSWD)
    query = """SELECT r.paper_id, cg.start, cg.end
               FROM refs r
               JOIN citations c ON r.id = c.ref_id
               JOIN citation_groups cg ON c.group_id = cg.id
               WHERE cited_paper_id='%s' """ % doc_id
    rows = db.select_query(query)

    # Group citations by citing paper
    citations = defaultdict(list)
    for citing_paper, start, end in rows:
        citations[citing_paper].append((start, end))

    return citations
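# Minimal usage sketch (the document id is hypothetical): the returned dict maps
# each citing paper to the character offsets of its citation groups.
#
#   citing = get_citing_papers('10.1.1.42.1234')
#   for citing_id, spans in citing.iteritems():
#       print citing_id, spans   # e.g. [(120, 135), (2210, 2224)]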
def time_diversity(names, query_set):
    # Get the year of each paper so returned results can be mapped to years
    db = MyMySQL(db=config.DATASET)
    rows = db.select(["id", "year"], table="papers",
                     where="year is not NULL and year between 1950 and 2013")
    years = {pub_id: year for pub_id, year in rows}

    for name in names:
        file_path = "%s/results/%s/%s/%s.p" % (config.DATA, config.DATASET, query_set, name)

        returned_years = []
        results = cPickle.load(open(file_path, 'r'))
        for _correct, _relevances, returned in results:
            for r in returned:
                if r in years:
                    returned_years.append(years[r])

        print "%s\t%.2f\t%.2f" % (name, np.mean(returned_years), np.std(returned_years))
def fix_contexts_limits():
    """
    Updates the contexts in the graph table so that the tokens at the
    extremities are removed. These are usually parts of words, and
    therefore meaningless.
    """
    db = MyMySQL(db="csx", user="******", passwd="")
    ctxs = db.select(["citing", "cited", "context"], table="graph", where="context != ''")
    print len(ctxs)

    for citing, cited, ctx in progress(ctxs):
        # Trim everything before the first and after the last space
        s = ctx.find(" ")
        e = ctx.rfind(" ")

        db.update(table="graph",
                  set="context='%s'" % ctx[s + 1:e],
                  where="(citing='%s') AND (cited='%s')" % (citing, cited))
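# A worked example of the trimming above (the context string is made up): slicing
# between the first and last space drops the partial words at both extremities.
#
#   >>> ctx = "ethod described in [3] outperforms the baseli"
#   >>> s, e = ctx.find(" "), ctx.rfind(" ")
#   >>> ctx[s + 1:e]
#   'described in [3] outperforms the'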
def get_layer_results(queries, searcher, folder, layer):
    db = MyMySQL(db=config.DATASET)

    def get_pub(pub_id):
        return db.select_one("title", table="papers", where="id='%s'" % pub_id)

    def get_author(author_id):
        return db.select_one("name", table="authors", where="cluster=%s" % author_id)

    def get_venue(venue_id):
        abbrev, name = db.select_one(["abbrev", "name"], table="venues", where="id=%s" % venue_id)
        return " ".join((abbrev, name)).strip()

    def get_keyword(kw):
        return kw

    # Create the folder that will hold the results for this layer
    if not os.path.exists(folder):
        os.makedirs(folder)

    print "\n%s" % folder

    # Each layer has a different handler to get the name of the entity
    get_entities = {'paper': get_pub,
                    'author': get_author,
                    'venue': get_venue,
                    'ngram': get_keyword}

    # Now fetch the results and save them
    for query in queries:
        file_path = os.path.join(folder, query.replace(' ', '+') + ".txt")
        print " ", query

        entity_ids = searcher.search(query, rtype=layer, limit=50)
        with open(file_path, 'w') as file:
            for eid in entity_ids:
                name = get_entities[layer](eid).strip()
                print >> file, "%s" % (name.encode("UTF-8"))
def __init__(self):
    '''Converter constructor.'''

    # Zeno task manager
    self.tasks = zeno.TasksManager("tasks",
                                   host=config.DB_HOST,
                                   user=config.DB_USER,
                                   passwd=config.DB_PASSWD)

    # Database connection
    self.db = MyMySQL(db=config.DB_NAME,
                      host=config.DB_HOST,
                      user=config.DB_USER,
                      passwd=config.DB_PASSWD)

    # Logging configuration
    self.log = utils.config_logging('converter', stream=sys.stdout, level=logging.DEBUG,
                                    format='%(asctime)s (%(name)s) [%(levelname)6s]: %(message)s',
                                    datefmt="%Y-%m-%d %H:%M:%S")
def get_texts(pub_ids, use_title=True, use_abs=True):
    '''
    This is a non-batch version. Much slower but more memory efficient.
    '''
    db = MyMySQL(db='csx', user='******', passwd='')

    fields = []
    if use_title:
        fields.append("title")
    if use_abs:
        fields.append("abstract")

    texts = []
    for pub_id in pub_ids:
        text_fields = db.select_one(fields=fields, table="papers", where="id='%s'" % pub_id)

        text = ''
        for tf in text_fields:
            if tf is not None:
                text += tf

        texts.append(text)

    return texts
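# Usage sketch (the ids are hypothetical): fetch title-only texts for two papers.
#
#   texts = get_texts(['10.1.1.1.1', '10.1.1.2.2'], use_title=True, use_abs=False)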
def search(self, query, exclude=[], limit=20, force=False):
    file_path = config.CITERANK_FILE_PATH
    if not os.path.exists(file_path):
        g = nx.DiGraph()
        g.add_edges_from(model.get_all_edges())

        # Remove documents from the exclude list
        g.remove_nodes_from(exclude)

        # Get the year of each paper for assembling the personalization array next
        db = MyMySQL(db=config.DATASET)
        rows = db.select(["id", "year"], table="papers")
        years = {}
        for pub_id, year in rows:
            if year is not None:
                years[pub_id] = year

        # Calculate the median to use for missing values
        year_median = np.median(years.values())

        # Create a personalization array by exponentially decaying
        # each paper's factor by its age
        pers = {}
        for node in g.nodes():
            if (node not in years) or (years[node] < 1960) or (years[node] > 2013):
                years[node] = year_median

            pers[node] = np.exp(float(years[node] - 2013) / self.tau)

        print "Running PageRank with %d nodes and age-defined personalization vector." % g.number_of_nodes()
        r = nx.pagerank(g, personalization=pers)

        print "Writing results"
        cPickle.dump(r, open(file_path, "w"))

    # Load cached PageRank values for every node
    r = cPickle.load(open(file_path, "r"))

    # Sort documents decreasingly by PageRank value
    ids, _scores = zip(*sorted(r.items(), key=lambda (k, v): v, reverse=True))

    # Fetch all documents that have at least one of the terms.
    # Store them in a set for fast lookup.
    pub_ids = self.index.search(query, search_fields=["title", "abstract"],
                                return_fields=["id"], ignore=exclude)
    pub_ids = set([pid for (pid, ) in pub_ids])

    results = []
    for id in ids:
        if id in pub_ids:
            results.append(id)
            if len(results) == limit:
                break

    return results
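# A quick sanity check of the age-decay factor used above, pers = exp((year - 2013) / tau),
# with a hypothetical tau = 5.0 (recent papers dominate the personalization vector):
#
#   >>> import numpy as np
#   >>> [round(np.exp(float(y - 2013) / 5.0), 3) for y in (2013, 2008, 1993)]
#   [1.0, 0.368, 0.018]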
'''
Created on Jun 29, 2015

@author: luamct
'''
from mymysql.mymysql import MyMySQL
import random
import networkx as nx
from utils import progress

db = MyMySQL(db='csx')


def get_cited(pub_id):
    return db.select("cited", table="graph", where="citing='%s'" % pub_id)


def get_citing(pub_id):
    return db.select("citing", table="graph", where="cited='%s'" % pub_id)


def get_neighbours(pub_id):
    return db.select(["citing", "cited"], table="graph",
                     where="citing='%s' OR cited='%s'" % (pub_id, pub_id))


def depth_walk():
    ids = db.select("id", table="papers", limit=10000)
from contexts import contexts
from collections import Counter, defaultdict
from mymysql.mymysql import MyMySQL
import nltk
import re
import utils
from utils import tokenize
import logging
import sys
from config import DATA, TOKENS_PATH, DB_NAME
from sklearn.feature_extraction.text import TfidfVectorizer

stopwords = set(nltk.corpus.stopwords.words('english'))

NUMBER = re.compile(r"\d+$")
VARIABLE = re.compile(r"\w\d$")

db = MyMySQL(db=DB_NAME)


def filter_tokens(tokens):
    '''
    Filters out some tokens before the analysis: stop words, very short
    tokens and plain numbers.
    '''
    valid = lambda (token, _freq): \
        (token not in stopwords) and \
        (len(token) >= 3) and \
        not re.match(NUMBER, token)

    return filter(valid, tokens)


def read_line(line):
import lxml.html
from pylucene import DocField, Index
from mymysql.mymysql import MyMySQL
import random
from utils import progress
import config
import os
from random import Random

#URL_TEMPLATE = "http://www.informatik.uni-trier.de/~ley/db/conf/index-%s.html"
#URL_TEMPLATE = "http://dblp.uni-trier.de/db/hc/conf/index-%s.html"
URL_TEMPLATE = "http://dblp.uni-trier.de/db/%s"

IGNORE_TERMS = ["proceedings", "proc."]

db = MyMySQL(db='aminer')


def download_venues(venue_type):
    '''
    Venue types available: ['conf', 'journals'].
    '''
    folder = config.DATA + ("venues/html/%s/" % venue_type)
    url = URL_TEMPLATE % venue_type

    pos = 1
    while (True):
        print "Processing %d-%d" % (pos, pos + 99)
'''
Created on May 26, 2015

@author: luamct
'''
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.preprocessing.label import MultiLabelBinarizer
from mymysql.mymysql import MyMySQL
from collections import Counter, defaultdict
import numpy as np
import random
import nltk
from utils import PubTexts

db = MyMySQL(db='csx', user='******', passwd='')

#rows = db.select(fields=["id", "title", "abstract"], table="papers")
#pubs = {str(id): (title, abs) for id, title, abs in rows}
#pubs = {str(id): (title + ' ' + abs) for id, title, abs in rows}

MAX_KWS = 10


def get_keywords(min=1):
    kws = db.select(fields=("id", "kw"),
                    table=("papers", "keywords"),
                    join_on=("id", "paper_id"))
    count = Counter([kw for _pid, kw in kws])
'''
@author: hugo
'''
import os
import sys
import time
import numpy as np
import config
from collections import defaultdict
from mymysql.mymysql import MyMySQL
from datasets.mag import get_selected_pubs
from ranking.kddcup_searchers import SimpleSearcher, Searcher
from evaluation.kddcup_expt import get_results_file, save_results

db = MyMySQL(db=config.DB_NAME, user=config.DB_USER, passwd=config.DB_PASSWD)


def rank_affils(selected_affils, conf_name, year, searcher, show=True, results_file=None):
    conf_id = db.select("id", "confs", where="abbr_name='%s'" % conf_name, limit=1)[0]

    start = time.time()
    if searcher.name() == "SimpleSearcher":
'''
Created on Jun 9, 2015

@author: luamct
'''
from collections import defaultdict
from mymysql.mymysql import MyMySQL
from utils import progress, plot
import sys
import config
from pylucene import Index, DocField

IGNORE_TERMS = ["proceedings", "proc."]

db = MyMySQL(db='aminer', user='******', passwd='')


def load_existing_venues():
    rows = db.select(fields=["id", "name"], table="venues")
    return {name: int(id) for id, name in rows}


def save_venue(id, name):
    db.insert(into="venues", fields=["id", "name"], values=[id, name])


def save_citations(id, cits):
    values = [(id, cid) for cid in cits]
    db.insert(into="graph", fields=["citing", "cited"], values=values)