Example #1
import os
import pickle as pkl  # the original module may alias cPickle instead
from utils.lib import O


def make_us_states_name_db():
    def split(line):
        # Each line presumably follows the SSA state-file layout:
        # state,gender,year,name,count
        splits = line.split(",")
        return splits[1], splits[3].lower()

    root_folder = "data/us_states"
    pkl_file = US_STATES_GENDER_FILE  # path constant defined elsewhere in the module
    db = {}
    for f_name in os.listdir(root_folder):
        f_name = "%s/%s" % (root_folder, f_name)
        with open(f_name) as f:
            print(f_name)
            for line in f:
                gender, name = split(line)
                node = db.get(name, None)
                if node is None:
                    node = O()
                    node.name = name
                    node.females = 0
                    node.males = 0
                # Counts one per row (per state-year occurrence),
                # not the count column of the data file.
                if gender == 'F':
                    node.females += 1
                elif gender == 'M':
                    node.males += 1
                db[name] = node
    with open(pkl_file, "wb") as f:
        pkl.dump(db, f, pkl.HIGHEST_PROTOCOL)
    return db
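Every snippet on this page builds on the O helper from utils.lib. Its source is not shown here, but from the way it is used (constructed empty, constructed with keyword arguments, and mutated attribute by attribute) a minimal sketch might look like the following; treat it as an inferred assumption, not the library's actual implementation:

# Inferred sketch of utils.lib.O -- an attribute bag. This is an
# assumption based on usage in these examples, not the real source.
class O(object):
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)  # O(a=1).a == 1; O() starts empty

    def __repr__(self):
        return "O(%s)" % ", ".join(
            "%s=%r" % kv for kv in sorted(self.__dict__.items()))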
Example #2
def build_graph(index, train_x, train_y, cite_map, use_references=True, from_cache=True):
  if use_references:
    cached = "cache/graphs/%d_ref.pkl" % index
  else:
    cached = "cache/graphs/%d.pkl" % index
  if os.path.isfile(cached) and from_cache:
    with open(cached, "rb") as f:  # binary mode to match the "wb" dump below
      return cPkl.load(f)
  vocab_file = 'cache/vocabulary/%d.pkl' % index
  vocabulary, reverse_vocabulary = construct_vocabulary(train_x, vocab_file)
  vocabulary_words = set(vocabulary.keys())
  analyze = predict.analyzer()
  doc_map = {}
  for x, y in zip(train_x, train_y):
    tokens = set(analyze(x.raw)).intersection(vocabulary_words)
    # add_tokens(tokens, nodes)
    doc = Doc(x.id, tokens, y)
    doc_map[x.id] = doc
  # Token-token co-occurrence counts, indexed by vocabulary position.
  edges = np.zeros((VOCAB_SIZE, VOCAB_SIZE), dtype=np.int16)
  for i, x in enumerate(train_x):
    if i % 1000 == 0:
      print(i)
    tokens = list(doc_map[x.id].tokens)
    make_self_edges(tokens, edges, vocabulary)
    if use_references:
      references = cite_map.get(x.id, [])
      for reference in references:
        if reference not in doc_map:  # belongs to test set
          continue
        make_edges(tokens, list(doc_map[reference].tokens), edges, vocabulary)
  word_network = O(doc_map=doc_map, edges=edges)
  with open(cached, "wb") as f:
    cPkl.dump(word_network, f, cPkl.HIGHEST_PROTOCOL)
  return word_network
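The make_self_edges and make_edges helpers are not shown. Judging by their call sites and the VOCAB_SIZE x VOCAB_SIZE matrix, they plausibly increment token co-occurrence counts; a sketch under that assumption, not the real code:

# Hypothetical sketch of the edge helpers, assuming `edges` counts
# co-occurrences between vocabulary indices.
def make_self_edges(tokens, edges, vocabulary):
    # Count every unordered pair of tokens within one document.
    for i, a in enumerate(tokens):
        for b in tokens[i + 1:]:
            edges[vocabulary[a], vocabulary[b]] += 1
            edges[vocabulary[b], vocabulary[a]] += 1

def make_edges(tokens, ref_tokens, edges, vocabulary):
    # Count pairs across a document and one of its references.
    for a in tokens:
        for b in ref_tokens:
            if a != b:
                edges[vocabulary[a], vocabulary[b]] += 1
                edges[vocabulary[b], vocabulary[a]] += 1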
Example #3
def make_indian_name_db():
    def split(line):
        # Columns 2, 3 and 4 hold the male count, female count and name.
        splits = line.split()
        return int(splits[2]), int(splits[3]), splits[4].lower()

    inp_file = "data/ind_names.txt"
    db = {}
    with open(inp_file) as f:
        index = 0
        for line in f:
            index += 1
            if index % 1000 == 0:
                print("Line : %d" % index)
            males, females, name = split(line)
            node = db.get(name, None)
            if node is None:
                node = O()
                node.name = name
                node.females = 0
                node.males = 0
            node.females += females
            node.males += males
            db[name] = node
    pkl_file = INDIAN_GENDER_FILE
    with open(pkl_file, "wb") as f:
        pkl.dump(db, f, pkl.HIGHEST_PROTOCOL)
    return db
Example #4
def make_name_db():
    root_folder = "data/us_names"
    pkl_file = US_GENDER_FILE
    db = {}
    for f_name in os.listdir(root_folder):
        f_name = "%s/%s" % (root_folder, f_name)
        with open(f_name) as f:
            print(f_name)
            for line in f:
                # Each line: name,gender,count
                name, gender, count = line.split(",")
                name = name.lower()
                node = db.get(name, None)
                if node is None:
                    node = O()
                    node.name = name
                    node.females = 0
                    node.males = 0
                if gender == 'F':
                    node.females += int(count)
                elif gender == 'M':
                    node.males += int(count)
                db[name] = node
    with open(pkl_file, "wb") as f:
        pkl.dump(db, f, pkl.HIGHEST_PROTOCOL)
    return db
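A short usage sketch for the database built above: load the pickle and guess a gender from the counts. The node layout (name, males, females) comes straight from the example; US_GENDER_FILE and pkl are the module's own constant and pickle alias, and the lookup logic is illustrative only:

# Usage sketch (illustrative): classify a first name by majority count.
with open(US_GENDER_FILE, "rb") as f:
    db = pkl.load(f)
node = db.get("alex")
if node is not None:
    print("male" if node.males >= node.females else "female")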
Example #5
def avg_score(metrics_arr):
    accuracies, precisions, recalls, f_scores = [], [], [], []
    for metrics in metrics_arr:
        accuracies.append(metrics.accuracy)
        precisions.append(metrics.precision)
        recalls.append(metrics.recall)
        f_scores.append(metrics.f_score)
    # Summarize each metric across runs as median and inter-quartile range.
    score = Metrics()
    score.accuracy = O(median=Metrics.median(accuracies),
                       iqr=Metrics.iqr(accuracies))
    score.precision = O(median=Metrics.median(precisions),
                        iqr=Metrics.iqr(precisions))
    score.recall = O(median=Metrics.median(recalls),
                     iqr=Metrics.iqr(recalls))
    score.f_score = O(median=Metrics.median(f_scores),
                      iqr=Metrics.iqr(f_scores))
    return score
Example #6
def avg_score(metrics_arr):
    accuracies, precisions, recalls, f_scores, specificities = [], [], [], [], []
    pre_reject_misseds = []
    for metrics in metrics_arr:
        accuracies.append(metrics.accuracy)
        precisions.append(metrics.precision)
        recalls.append(metrics.recall)
        f_scores.append(metrics.f_score)
        specificities.append(metrics.specificity)
        # Fraction of pre-rejected items that were missed; EPS guards
        # against division by zero.
        pre_reject_misseds.append(metrics.pre_reject_missed /
                                  (metrics.pre_reject + metrics.EPS))
    score = O()
    score.accuracy = O(median=Metrics.median(accuracies),
                       iqr=Metrics.iqr(accuracies))
    score.precision = O(median=Metrics.median(precisions),
                        iqr=Metrics.iqr(precisions))
    score.recall = O(median=Metrics.median(recalls),
                     iqr=Metrics.iqr(recalls))
    score.f_score = O(median=Metrics.median(f_scores),
                      iqr=Metrics.iqr(f_scores))
    score.specificity = O(median=Metrics.median(specificities),
                          iqr=Metrics.iqr(specificities))
    score.pre_reject_missed = O(median=Metrics.median(pre_reject_misseds),
                                iqr=Metrics.iqr(pre_reject_misseds))
    return score
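Metrics.median and Metrics.iqr are not shown. A plausible implementation, assuming they are the standard median and inter-quartile range (an assumption, sketched with numpy):

import numpy as np

class Metrics(object):
    EPS = 1e-8  # assumed small constant; the real value is not shown

    @staticmethod
    def median(values):
        return float(np.median(values))

    @staticmethod
    def iqr(values):
        # Inter-quartile range: 75th minus 25th percentile.
        q75, q25 = np.percentile(values, [75, 25])
        return float(q75 - q25)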
Example #7
def get_venues():
    db = DB.get()
    cur = db.cursor()
    cur.execute('SELECT * FROM venues')
    venues = OrderedDict()
    for row in cur.fetchall():
        # Row layout: id, acronym, name, impact, is_conference flag.
        venue = O()
        venue.id = str(row[0])
        venue.acronym = row[1]
        venue.name = row[2]
        venue.impact = int(row[3])
        venue.is_conference = (row[4] == 1)
        venues[venue.id] = venue
    DB.close()
    return venues
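Usage sketch: get_venues() returns an OrderedDict keyed by venue id, so iterating preserves the table's row order. The loop below is illustrative only:

# Illustrative only: list conferences with their impact scores.
venues = get_venues()
for venue in venues.values():
    if venue.is_conference:
        print(venue.acronym, venue.impact)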
Example #8
def vectorize(papers, iterations=ITERATIONS):
    miner, graph, lda_model, vocab = get_graph_lda_data(iterations=iterations)
    # vectorizer = text.CountVectorizer(stop_words=STOP_WORDS, token_pattern=TOKEN_PATTERN)
    # Fall back to the title when the abstract is missing or the literal
    # string 'None'.
    docs = [paper.abstract
            if paper.abstract is not None and paper.abstract != 'None'
            else paper.title
            for paper in papers]
    doc_2_vec = miner.vectorizer.transform(docs)
    doc_2_vec_array = doc_2_vec.toarray()
    transformed = lda_model.transform(doc_2_vec_array)
    report(lda_model, vocab)
    for paper, t, d_2_v in zip(papers, transformed, doc_2_vec_array):
        paper.transformed = t
        paper.doc_2_vec = d_2_v
    return O(miner=miner,
             graph=graph,
             lda_model=lda_model,
             vocab=vocab,
             doc_2_vec=doc_2_vec)
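After vectorize() runs, each paper carries transformed (its LDA topic distribution) and doc_2_vec (its raw term counts). A hedged usage sketch, assuming numpy is available:

# Illustrative only: report each paper's dominant topic.
import numpy as np

result = vectorize(papers)
for paper in papers[:3]:
    top_topic = int(np.argmax(paper.transformed))
    print(paper.title, "-> topic", top_topic)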
Example #9
    plt.savefig(fig_name, bbox_inches='tight')
    plt.clf()


# Settings for 10 rows and 5 columns
settings_10_5 = O(
    fig_size=(8, 8),
    col_axes=[
        0.3,   # col dendrogram left
        0.81,  # col dendrogram bottom
        0.36,  # col dendrogram width
        0.15,  # col dendrogram height
    ],
    row_axes=[
        0.0,    # row dendrogram left
        0.055,  # row dendrogram bottom
        0.23,   # row dendrogram width
        0.69,   # row dendrogram height
    ],
    plot_axes=[
        0.10,  # heatmap left
        0.05,  # heatmap bottom
        0.7,   # heatmap width
        0.7,   # heatmap height
    ],
)

# Settings for 10 rows and 4 columns
settings_10_4 = O(
    fig_size=(8, 8),
    col_axes=[
Example #10
# logging.getLogger('lda').setLevel(logging.ERROR)

# GRAPHS = {
#   "v5": "data/citemap_v10.csv",
#   "v2": "data/citemap_v4.csv"
# }
GRAPH_CSV = "data/citemap_v10.csv"

# For 11 TOPICS
ALPHA = 0.22359
BETA = 0.53915
ITERATIONS = 100

MIN_DIVERSITY_SCORE = 0.075

THE = O()
THE.permitted = "all"  # conference/journal/all
THE.version = "v5"
THE.use_numeric = False
THE.random_state = 0
THE.IGNORE_VENUES = {"v5": set(), "v2": {"ICPC", "MDLS", "SOSYM", "SCAM"}}

STOP_WORDS = text.ENGLISH_STOP_WORDS.union([
    'software', 'engineering', 'paper', 'study', 'based', 'results',
    'approach', 'case', 'workshop', 'international', 'research', 'conference',
    'introduction', 'editors', 'article', 'issue', 'month', 'copyright',
    'special', 'used', 'using', 'use', 'studies', 'review', 'editorial',
    'report', 'book', 'ieee', 'published', 'science', 'column', 'author',
    'proposed', 'icse', 'year', 'articles', 'page', '2000', '2004',
    'papers', 'computer', 'held', 'editor'
])
Example #11
import sys
import os

sys.path.append(os.path.abspath("."))
sys.dont_write_bytecode = True

__author__ = "bigfatnoob"

from utils.lib import O

ROOT_SCOPE = "__ROOT__"

VAR_TYPE = O(GLOBAL="global",
             LOCAL="local",
             ARG="arg",
             VARARG="vararg",
             KWARG="kwarg")

SCOPE_SEPARATOR = "->"

PRIMITIVES = {'int', 'long', 'float', 'str', 'bool'}

GENERATED_PREFIX = "generated_py_"

TEMPORARY_PREFIX = "tmp_py_"

FUNCTION_PREFIX = "func_"

METHOD_WAIT_TIMEOUT = 1
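Usage sketch: VAR_TYPE uses O as a lightweight string enum, and scope names chain with SCOPE_SEPARATOR. Both lines below are illustrative:

assert VAR_TYPE.LOCAL == "local"  # enum-style attribute access
scope = SCOPE_SEPARATOR.join([ROOT_SCOPE, "my_func"])  # "__ROOT__->my_func"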
Example #12
# For 11 TOPICS
N_TOPICS = 11
ALPHA = 0.22359
BETA = 0.53915
ITERATIONS = 100
# TOPICS = ["TPC %d" % d for d in range(11)]
TOPICS = [
    "Design", "Testing", "Modelling", "Mobile", "Energy", "Defects",
    "SourceCode", "WebApps", "Configuration", "Developer", "Mining"
]
TOPIC_THRESHOLD = 3

dendo_11_settings = O(
    fig_size=(6, 8),
    col_axes=[0.25, 0.65, 0.50, 0.11],
    row_axes=[0.0, 0.215, 0.2, 0.375],
    plot_axes=[0.25, 0.1, 0.63, 0.6],
)

dendo_14_settings = O(
    fig_size=(8, 8),
    col_axes=[0.25, 0.65, 0.50, 0.11],
    row_axes=[0.0, 0.2, 0.21, 0.4],
    plot_axes=[0.25, 0.1, 0.63, 0.6],
)

dendo_16_settings = O(
    fig_size=(8, 8),
    col_axes=[0.25, 0.65, 0.50, 0.11],
    row_axes=[0.0, 0.225, 0.21, 0.35],
    plot_axes=[0.25, 0.1, 0.63, 0.6],
Example #13
from __future__ import print_function, division
import sys
import os
sys.path.append(os.path.abspath("."))
sys.dont_write_bytecode = True
from utils.lib import O

_dend_7_8 = O(fig_size=(8, 8),
              col_axes=[0.3, 0.6, 0.5, 0.16],
              row_axes=[0.0, 0.13, 0.21, 0.45],
              plot_axes=[0.3, 0.05, 0.63, 0.6])

_dend_7_9 = O(fig_size=(8, 8),
              col_axes=[0.25, 0.6, 0.51, 0.18],
              row_axes=[0.0, 0.15, 0.21, 0.4],
              plot_axes=[0.25, 0.05, 0.63, 0.6])

_dend_7_13 = O(fig_size=(8, 8),
               col_axes=[0.25, 0.52, 0.51, 0.18],
               row_axes=[0.0, 0.21, 0.22, 0.28],
               plot_axes=[0.25, 0.05, 0.63, 0.6])

_dend_7_14 = O(fig_size=(8, 8),
               col_axes=[0.25, 0.52, 0.51, 0.18],
               row_axes=[0.0, 0.22, 0.21, 0.26],
               plot_axes=[0.25, 0.05, 0.63, 0.6])

_dend_11_18 = O(fig_size=(8, 8),
                col_axes=[0.3, 0.63, 0.63, 0.18],
                row_axes=[0.0, 0.23, 0.19, 0.39],
                plot_axes=[0.3, 0.05, 0.63, 0.6])
Example #14
from algorithms.parallel.de.de import DE as DE_P
from mpi4py import MPI
from time import clock, sleep  # note: time.clock() was removed in Python 3.8
from utils.lib import O, report
import utils.sk as sk
from problems.pom3.pom3a import POM3A
from problems.pom3.pom3b import POM3B
from problems.pom3.pom3c import POM3C
from problems.pom3.pom3d import POM3D
from problems.xomo.xomo import XOMO

COMM = MPI.COMM_WORLD
RANK = COMM.rank
SIZE = COMM.size

settings = O(runs=20)


def _run_parallel():
    # NOTE: DTLZ2 is not imported above; the original module presumably
    # imports it from the problems package (e.g. a DTLZ family module).
    model = DTLZ2(3)
    opt = DE_P(model)
    times, convs, dives = [], [], []
    for i in range(settings.runs):
        print(i)
        start = clock()
        goods = DE_P.run(opt, id=i)
        if RANK == 0:
            times.append(clock() - start)
            convs.append(opt.convergence(goods))
            dives.append(opt.diversity(goods))
    if RANK == 0: