예제 #1
0
 def __init__(self):
     """Load the mined element counts and create the element normalizer.

     Sets:
       all_elems: set built from the keys of the loaded counts.
       all_elem_counts: the object loaded from the
         'elem_pyplot_counts_0404' backup.
       enormer: a new ElementNormalizer instance.
     """
     backup_dir = relative_path('experimental/code_suggest/output/backup')
     counts = BackupHandler(backup_dir).load('elem_pyplot_counts_0404')
     self.all_elems = set(counts.keys())
     self.all_elem_counts = counts
     self.enormer = ElementNormalizer()
예제 #2
0
def code_examples():
  """
  Yield code examples from three corpora in order: SO, GitHub, then Web.

  Each corpus is read through BackupHandler only if its module-level cache
  (all_codes1 / all_codes2 / all_codes3) is still falsy, and only when the
  generator reaches that corpus, so a later corpus is not loaded until the
  earlier ones have been fully consumed.

  """

  global all_codes1, all_codes2, all_codes3

  # Corpus 1: 15770 code examples mined from SO answers in threads that are
  # tagged "matplotlib".
  if not all_codes1:
    print('Loading SO code examples...')
    so_store = BackupHandler(relative_path('experimental/code_suggest'))
    all_codes1 = so_store.load('all_codes')
    print('%d examples from SO' % len(all_codes1))

  for so_example in all_codes1:
    yield so_example

  # print 'WARNING: mine_element.py ignoring all GitHub code examples...'
  # """
  # Corpus 2: 8732 code examples (including 395 IPython Notebook files) mined
  # from GitHub repositories that contain "matplotlib".
  if not all_codes2:
    print('Loading GitHub code examples...')
    github_store = BackupHandler(
      relative_path('experimental/code_suggest/output/backup'))
    all_codes2 = github_store.load('all_codes_github_1k_repo_0322')
    print('%d examples from GitHub' % len(all_codes2))

  for github_example in all_codes2:
    yield github_example
  # """

  # Corpus 3: 21993 code examples extracted by Shiyan from the Web.
  if not all_codes3:
    print('Loading Web code examples')
    web_store = BackupHandler(relative_path('experimental/mining/output'))
    all_codes3 = web_store.load('codes_shiyan_0331_web')
    print('%d examples from Web Shiyan' % len(all_codes3))

  for web_example in all_codes3:
    yield web_example
예제 #3
0
    def __init__(self):
        """Load the data CodeSuggest serves from into memory.

        Sets:
          plot_commands: the result of get_plot_commands().
          nonplot_commands: keys of get_pyplot_fu() that are not in
            plot_commands.
          code_example_lookup: [func_id] = list of
            {'code': ..., 'svg': ...} dicts read from the `example` table
            of demo/data/code.sqlite3, ordered by
            get_effective_code_len() of the code.
          elem_val_counts: [elem] = list of (val, count) pairs, largest
            count first.
          func_position_finder: a FuncPositionFinder instance.
        """
        plot_commands = get_plot_commands()
        pyplot_fu = get_pyplot_fu()
        self.plot_commands = plot_commands
        self.nonplot_commands = [
            f for f in pyplot_fu.keys() if f not in plot_commands
        ]
        print('CodeSuggest: extracted %d plot commands' % len(plot_commands))

        # Load all code examples of plotting commands from db into memory
        # These are generated by index_examples.py
        print('CodeSuggest: Loading code examples and pregenerated SVGs...')
        db = sqlite3.connect(relative_path('demo/data/code.sqlite3'))
        try:
            cursor = db.cursor()
            cursor.execute("SELECT func_id, code, svg FROM example")
            rows = cursor.fetchall()
        finally:
            # Everything needed is in `rows`; close the connection here so
            # it is released even when the query raises.
            db.close()

        code_example_lookup = {}  # [func_id] = [{'code': code, 'svg': svg}]
        for func_id, code, svg in rows:
            code_example_lookup.setdefault(func_id, []).append({
                'code': code,
                'svg': svg
            })
        # Order the examples of each function by effective code length
        for examples in code_example_lookup.values():
            examples.sort(key=lambda x: get_effective_code_len(x['code']))
        self.code_example_lookup = code_example_lookup
        print('CodeSuggest: Loaded %d code examples (with svgs)...' %
              len(rows))

        # Load element_index generated by experimental/code_suggest/mine_argvs.py
        # bh = BackupHandler(relative_path('demo/data'))
        # self.element_index = bh.load('element_index')
        # print 'Loaded element_index with %d keys'%len(self.element_index)

        # Load element value counts
        bh2 = BackupHandler(
            relative_path('experimental/code_suggest/output/backup'))
        self.elem_val_counts = bh2.load(
            'elem_pyplot_value_counts_0404')  # [elem][val] = count
        # Replace each {val: count} mapping by its (val, count) pairs sorted
        # by descending count.
        for elem_id in self.elem_val_counts:
            self.elem_val_counts[elem_id] = sorted(
                self.elem_val_counts[elem_id].items(), key=lambda x: -x[1])

        self.func_position_finder = FuncPositionFinder()
예제 #4
0
  def __init__(self):
    """Load the data CodeSuggest serves from into memory.

    Sets:
      plot_commands: the result of get_plot_commands().
      nonplot_commands: keys of get_pyplot_fu() that are not in
        plot_commands.
      code_example_lookup: [func_id] = list of {'code': ..., 'svg': ...}
        dicts read from the `example` table of demo/data/code.sqlite3,
        ordered by get_effective_code_len() of the code.
      elem_val_counts: [elem] = list of (val, count) pairs, largest count
        first.
      func_position_finder: a FuncPositionFinder instance.
    """
    plot_commands = get_plot_commands()
    pyplot_fu = get_pyplot_fu()
    self.plot_commands = plot_commands
    self.nonplot_commands = [f for f in pyplot_fu.keys() if f not in plot_commands]
    print('CodeSuggest: extracted %d plot commands' % len(plot_commands))

    # Load all code examples of plotting commands from db into memory
    # These are generated by index_examples.py
    print('CodeSuggest: Loading code examples and pregenerated SVGs...')
    db = sqlite3.connect(relative_path('demo/data/code.sqlite3'))
    try:
      cursor = db.cursor()
      cursor.execute("SELECT func_id, code, svg FROM example")
      rows = cursor.fetchall()
    finally:
      # Everything needed is in `rows`; close the connection here so it is
      # released even when the query raises.
      db.close()

    code_example_lookup = {}  # [func_id] = [{'code': code, 'svg': svg}]
    for func_id, code, svg in rows:
      code_example_lookup.setdefault(func_id, []).append(
        {'code': code, 'svg': svg})
    # Order the examples of each function by effective code length
    for examples in code_example_lookup.values():
      examples.sort(key=lambda x: get_effective_code_len(x['code']))
    self.code_example_lookup = code_example_lookup
    print('CodeSuggest: Loaded %d code examples (with svgs)...' % len(rows))

    # Load element_index generated by experimental/code_suggest/mine_argvs.py
    # bh = BackupHandler(relative_path('demo/data'))
    # self.element_index = bh.load('element_index')
    # print 'Loaded element_index with %d keys'%len(self.element_index)

    # Load element value counts
    bh2 = BackupHandler(relative_path('experimental/code_suggest/output/backup'))
    self.elem_val_counts = bh2.load('elem_pyplot_value_counts_0404')  # [elem][val] = count
    # Replace each {val: count} mapping by its (val, count) pairs sorted by
    # descending count.
    for elem_id in self.elem_val_counts:
      self.elem_val_counts[elem_id] = sorted(
        self.elem_val_counts[elem_id].items(), key=lambda x: -x[1])

    self.func_position_finder = FuncPositionFinder()
예제 #5
0
3. take average per function

Output:
 - a dictionary: [function] = average_position
   average position: between 0 (beginning of code) and 1 (end of code).
"""

import ast
from collections import defaultdict

from codemend import BackupHandler, relative_path
from codemend.models.annotate_code_with_api import get_fu_fau, findCallNodes, extractCallComponents

fu, fau = get_fu_fau()
bh = BackupHandler(relative_path('experimental/code_suggest'))
all_codes = bh.load('all_codes')
print 'There are %d code examples in total'%len(all_codes)

pos_sum = defaultdict(float)  # [f] = sum
pos_cnt = defaultdict(int)  # [f] = count
for code in all_codes:
  try:
    node = ast.parse(code)
  except SyntaxError:
    continue
  calls = findCallNodes(node)
  called_funcs = [extractCallComponents(x)[0] for x in calls]
  called_funcs = filter(lambda x: x in fu, called_funcs)
  if len(calls) < 3:
    continue
  for i, f in enumerate(called_funcs):
예제 #6
0
- #3: Not recommending elements that occur too infrequently.
- #4: When a function is not used before, and its argv is recommended, we strip
  the "@", and recommend the function first, followed by the argv. e.g.
  [pie@0, pie] => [pie, pie@0].

"""

from codemend import BackupHandler, relative_path
from codemend.demo.code_suggest import get_plot_commands
from codemend.models.baseline2 import SuggestItem

# Module-level data, loaded once at import time.
# plot_commands_set is the set form of plot_commands; prune() uses it for
# membership tests.
plot_commands = get_plot_commands()
plot_commands_set = set(plot_commands)

# Element counts read from the 'elem_pyplot_counts_0404' backup.
bh = BackupHandler(relative_path('experimental/code_suggest/output/backup'))
elem_counts = bh.load('elem_pyplot_counts_0404')


def prune(used_elems, suggest_elems):
    for elem in used_elems:
        assert isinstance(elem, basestring)
    for elem in suggest_elems:
        assert isinstance(elem, SuggestItem), type(elem)

    used_elems_set = set(used_elems)
    used_funcs = map(get_func_name, used_elems)
    used_funcs_set = set(used_funcs)
    has_used_plot_commands = any(
        map(lambda x: x in plot_commands_set, used_funcs))

    filtered_suggests = []
예제 #7
0
 def __init__(self):
   """Load the 'pos_ave' backup from demo/data into self.pos_ave."""
   data_dir = relative_path('demo/data')
   self.pos_ave = BackupHandler(data_dir).load('pos_ave')
   n_loaded = len(self.pos_ave)
   print('FuncPositionFinder: loaded %d average positions for functions' % n_loaded)
예제 #8
0
 def __init__(self):
     """Load the 'pos_ave' backup from demo/data into self.pos_ave."""
     store = BackupHandler(relative_path('demo/data'))
     positions = store.load('pos_ave')
     self.pos_ave = positions
     print('FuncPositionFinder: loaded %d average positions for functions'
           % len(positions))
예제 #9
0
from codemend.models.extract_so_code import load_threads, Thread, Answer
from codemend import BackupHandler, relative_path

if __name__ == '__main__':
  # Dump "<qid>\t<title>" for every matplotlib SO thread, reusing the
  # 'mpl_threads' backup when it can be loaded.
  backup = BackupHandler(relative_path('models/output/backup'))

  try:
    threads = backup.load('mpl_threads')
  except AssertionError:
    # presumably load() asserts when the backup is missing -- TODO confirm.
    # Query the threads afresh and store them for the next run.
    question_filter = "Tags LIKE '%<matplotlib>%' AND AnswerCount > 0 AND Score >= 0"
    answer_filter = "Score >= 0 ORDER BY Score DESC LIMIT 3"
    threads = list(load_threads(qfilter=question_filter, afilter=answer_filter))
    backup.save('mpl_threads', threads)

  titles_path = relative_path('models/output/mpl_so_titles.txt')
  with open(titles_path, 'w') as writer:
    for thread in threads:
      row = (thread.qid, thread.qtitle.encode('utf-8'))
      writer.write('%d\t%s\n' % row)
예제 #10
0
파일: eval2.py 프로젝트: ronxin/codemend
 def __init__(self):
   """Load the mined element counts and create the element normalizer.

   Sets:
     all_elems: set built from the keys of the loaded counts.
     all_elem_counts: the object loaded from the
       'elem_pyplot_counts_0404' backup.
     enormer: a new ElementNormalizer instance.
   """
   backup_dir = relative_path('experimental/code_suggest/output/backup')
   counts = BackupHandler(backup_dir).load('elem_pyplot_counts_0404')
   self.all_elems = set(counts.keys())
   self.all_elem_counts = counts
   self.enormer = ElementNormalizer()
예제 #11
0
The table is like this:
  (func_id, code, svg)

There are at most 20 (shortest) examples per func_id.

"""
import sqlite3

from codemend import BackupHandler, relative_path

if __name__ == '__main__':

    print 'Reading SVGs and code examples. Takes 7.3 seconds...'
    bh = BackupHandler('.')
    svgs = bh.load('svgs')
    all_codes = bh.load('all_codes')
    plotcommands_examples = bh.load(
        'plotcommands_examples')  # [plot_command] = [example_idx]

    db = sqlite3.connect(relative_path('demo/data/code.sqlite3'))
    cursor = db.cursor()

    cursor.executescript("""
    DROP TABLE IF EXISTS example;

    CREATE TABLE example (
      func_id TEXT NOT NULL,
      code TEXT NOT NULL,
      svg TEXT
    );
예제 #12
0
def get_effective_code_len(code):
    """
    Return the number of characters in a code example, skipping every line
    that has "import" as one of its whitespace-separated tokens.

    The newlines joining the remaining lines are counted.
    """
    kept_lines = [
        line for line in code.split('\n') if 'import' not in line.split()
    ]
    return len('\n'.join(kept_lines))


if __name__ == '__main__':

    print 'Reading SVGs and code examples. Takes 7.3 seconds...'
    bh = BackupHandler('.')
    svgs = bh.load('svgs')
    all_codes = bh.load('all_codes')

    print 'Loading functions that are plotting commands'
    # Copied from code_suggest.py
    import csv
    import pattern.en
    # Load csv file of pyplot summary
    pyplot_fu = {}  # [func] = utter
    print 'CodeSuggest: Loading pyplot fu...'
    with open('../../docstring_parse/pyplot_fu.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header
        for f, u in reader:
            if not u:
                continue
예제 #13
0
  def __init__(self, w2v_model, all_elem_counts, maxngram=1,
               name=None, use_lemma=True,
               heuristic=False, use_coke=False):
    """
    Load the word2vec model and load or build the element indexes.

    w2v_model can be a binary vectors file, or a loaded gensim model instance.

    Args:
      w2v_model: a path string handed to load_gensim_from_binary_file, or a
        Word2Vec instance.
      all_elem_counts: dict whose keys are the elements to index.
      maxngram, use_lemma, heuristic: stored on the instance unchanged.
      name: name of this instance; when falsy, the model's `filename` is
        used if the model has one.
      use_coke: when true, load the 'coke_0329' backup into self.coke.

    Raises:
      AssertionError: all_elem_counts is not a dict, or w2v_model is neither
        a string nor a Word2Vec instance.
      ValueError: use_coke is true but the coke backup does not exist.

    """
    self.maxngram = maxngram
    self.name = name
    self.use_lemma = use_lemma
    assert isinstance(all_elem_counts, dict)
    self.all_elem_counts = all_elem_counts
    self.heuristic = heuristic
    self.use_coke = use_coke

    if isinstance(w2v_model, basestring):
      self.model = load_gensim_from_binary_file(w2v_model)
      self.model.filename = w2v_model.split('/')[-1]
    else:
      assert isinstance(w2v_model, Word2Vec)
      self.model = w2v_model

    # A model instance passed in directly may not carry a `filename`.
    has_filename = hasattr(self.model, 'filename')
    if not self.name and has_filename:
      self.name = self.model.filename

    self.model.init_sims()  # normalize the vectors

    self.enormer = ElementNormalizer()

    if self.use_coke:
      bh = BackupHandler(relative_path('models/output/backup'))
      coke_file = 'coke_0329'
      if not bh.exists(coke_file):
        raise ValueError('Coke file does not exist: %s' % coke_file)
      self.coke = bh.load(coke_file)

    print('Trying to load element indexes from cache ...')
    bh = BackupHandler(relative_path('models/output/backup'))
    # The cache is keyed by the model's filename. Without one there is no
    # safe key, so the indexes are rebuilt and not cached (previously this
    # raised AttributeError).
    elem_index_backup_name = None
    if has_filename:
      elem_index_backup_name = self.model.filename + '_elem_index'

    if elem_index_backup_name is not None and bh.exists(elem_index_backup_name):
      self.idfs, self.elems, self.elem_lookup, self.vecmat = bh.load(elem_index_backup_name)

    else:
      print('Word2vecBaseline building element indexes...')

      fu, fau = get_fu_fau()
      self.idfs = self.get_idf(fu.values() + fau.values())

      self.elems = sorted(self.all_elem_counts.keys())
      # elem_lookup maps each element to its row in vecmat
      self.elem_lookup = dict((y,x) for (x,y) in enumerate(self.elems))
      vecs = []
      for e in self.elems:
        u = doc_serve.get_training_doc(e, True)
        v = self.get_bow_representation(u)
        vecs.append(v)
      self.vecmat = np.array(vecs)
      assert self.vecmat.shape == (len(self.elems), self.model.vector_size)

      if elem_index_backup_name is not None:
        bh.save(elem_index_backup_name, (self.idfs, self.elems, self.elem_lookup, self.vecmat))

      print('Finished building indexes.')
예제 #14
0
def get_effective_code_len(code):
  """
  Return the number of characters in a code example, skipping every line
  that has "import" as one of its whitespace-separated tokens.

  The newlines joining the remaining lines are counted.
  """
  char_total = 0
  kept_count = 0
  for line in code.split('\n'):
    if 'import' in line.split():
      continue
    char_total += len(line)
    kept_count += 1
  # One joining newline between each pair of kept lines
  return char_total + max(kept_count - 1, 0)


if __name__ == '__main__':

  print 'Reading SVGs and code examples. Takes 7.3 seconds...'
  bh = BackupHandler('.')
  svgs = bh.load('svgs')
  all_codes = bh.load('all_codes')




  print 'Loading functions that are plotting commands'
  # Copied from code_suggest.py
  import csv
  import pattern.en
  # Load csv file of pyplot summary
  pyplot_fu = {}  # [func] = utter
  print 'CodeSuggest: Loading pyplot fu...'
  with open('../../docstring_parse/pyplot_fu.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # skip the header
예제 #15
0
    def __init__(self,
                 w2v_model,
                 all_elem_counts,
                 maxngram=1,
                 name=None,
                 use_lemma=True,
                 heuristic=False,
                 use_coke=False):
        """
        Load the word2vec model and load or build the element indexes.

        w2v_model can be a binary vectors file, or a loaded gensim model
        instance.

        Args:
          w2v_model: a path string handed to load_gensim_from_binary_file,
            or a Word2Vec instance.
          all_elem_counts: dict whose keys are the elements to index.
          maxngram, use_lemma, heuristic: stored on the instance unchanged.
          name: name of this instance; when falsy, the model's `filename`
            is used if the model has one.
          use_coke: when true, load the 'coke_0329' backup into self.coke.

        Raises:
          AssertionError: all_elem_counts is not a dict, or w2v_model is
            neither a string nor a Word2Vec instance.
          ValueError: use_coke is true but the coke backup does not exist.

        """
        self.maxngram = maxngram
        self.name = name
        self.use_lemma = use_lemma
        assert isinstance(all_elem_counts, dict)
        self.all_elem_counts = all_elem_counts
        self.heuristic = heuristic
        self.use_coke = use_coke

        if isinstance(w2v_model, basestring):
            self.model = load_gensim_from_binary_file(w2v_model)
            self.model.filename = w2v_model.split('/')[-1]
        else:
            assert isinstance(w2v_model, Word2Vec)
            self.model = w2v_model

        # A model instance passed in directly may not carry a `filename`.
        has_filename = hasattr(self.model, 'filename')
        if not self.name and has_filename:
            self.name = self.model.filename

        self.model.init_sims()  # normalize the vectors

        self.enormer = ElementNormalizer()

        if self.use_coke:
            bh = BackupHandler(relative_path('models/output/backup'))
            coke_file = 'coke_0329'
            if not bh.exists(coke_file):
                raise ValueError('Coke file does not exist: %s' % coke_file)
            self.coke = bh.load(coke_file)

        print('Trying to load element indexes from cache ...')
        bh = BackupHandler(relative_path('models/output/backup'))
        # The cache is keyed by the model's filename. Without one there is
        # no safe key, so the indexes are rebuilt and not cached (previously
        # this raised AttributeError).
        elem_index_backup_name = None
        if has_filename:
            elem_index_backup_name = self.model.filename + '_elem_index'

        if (elem_index_backup_name is not None
                and bh.exists(elem_index_backup_name)):
            self.idfs, self.elems, self.elem_lookup, self.vecmat = bh.load(
                elem_index_backup_name)

        else:
            print('Word2vecBaseline building element indexes...')

            fu, fau = get_fu_fau()
            self.idfs = self.get_idf(fu.values() + fau.values())

            self.elems = sorted(self.all_elem_counts.keys())
            # elem_lookup maps each element to its row in vecmat
            self.elem_lookup = dict((y, x) for (x, y) in enumerate(self.elems))
            vecs = []
            for e in self.elems:
                u = doc_serve.get_training_doc(e, True)
                v = self.get_bow_representation(u)
                vecs.append(v)
            self.vecmat = np.array(vecs)
            assert self.vecmat.shape == (len(self.elems),
                                         self.model.vector_size)

            if elem_index_backup_name is not None:
                bh.save(elem_index_backup_name,
                        (self.idfs, self.elems, self.elem_lookup, self.vecmat))

            print('Finished building indexes.')
예제 #16
0
    with open('../../models/output/mpl_code_blocks.txt') as reader:
        content = reader.read()

    content = content.decode('utf-8')
    content = content.replace("&lt;", "<")
    content = content.replace("&gt;", ">")
    content = content.replace("&amp;", "&")

    sompl_blocks = content.split(
        '\n\n\n')  # stackoverflow matplotlib code blocks
    print 'There are %d code examples from mpl stackoverflow' % len(
        sompl_blocks)

    # Step 2:
    bh = BackupHandler('.')
    cookbook_segs = bh.load('cookbook_segs')
    cookbook_blocks = []
    for tag, p in cookbook_segs:
        if tag == 'CODE':
            cookbook_blocks.append(p)

    print 'There are %d code examples from matplotlib cookbook' % len(
        cookbook_blocks)

    all_codes = sompl_blocks + cookbook_blocks

    print 'There are %d code blocks in total' % (len(all_codes))

    # Step 3:
    counters = {}
    counter_names = [
예제 #17
0
from codemend.models.extract_so_code import load_threads, Thread, Answer
from codemend import BackupHandler, relative_path

if __name__ == '__main__':
    # Dump "<qid>\t<title>" for every matplotlib SO thread, reusing the
    # 'mpl_threads' backup when it can be loaded.
    backup = BackupHandler(relative_path('models/output/backup'))

    try:
        threads = backup.load('mpl_threads')
    except AssertionError:
        # presumably load() asserts when the backup is missing -- TODO
        # confirm. Query the threads afresh and store them for the next run.
        question_filter = (
            "Tags LIKE '%<matplotlib>%' AND AnswerCount > 0 AND Score >= 0")
        answer_filter = "Score >= 0 ORDER BY Score DESC LIMIT 3"
        threads = list(
            load_threads(qfilter=question_filter, afilter=answer_filter))
        backup.save('mpl_threads', threads)

    titles_path = relative_path('models/output/mpl_so_titles.txt')
    with open(titles_path, 'w') as writer:
        for thread in threads:
            row = (thread.qid, thread.qtitle.encode('utf-8'))
            writer.write('%d\t%s\n' % row)
예제 #18
0
  # Step 1:
  # Copied from annotate_code_with_api.py
  with open('../../models/output/mpl_code_blocks.txt') as reader:
    content = reader.read()

  content = content.decode('utf-8')
  content = content.replace("&lt;", "<")
  content = content.replace("&gt;", ">")
  content = content.replace("&amp;", "&")

  sompl_blocks = content.split('\n\n\n')  # stackoverflow matplotlib code blocks
  print 'There are %d code examples from mpl stackoverflow'%len(sompl_blocks)

  # Step 2:
  bh = BackupHandler('.')
  cookbook_segs = bh.load('cookbook_segs')
  cookbook_blocks = []
  for tag, p in cookbook_segs:
    if tag == 'CODE':
      cookbook_blocks.append(p)

  print 'There are %d code examples from matplotlib cookbook'%len(cookbook_blocks)

  all_codes = sompl_blocks + cookbook_blocks

  print 'There are %d code blocks in total'%(len(all_codes))

  # Step 3:
  counters = {}
  counter_names = ['syntax_errors', 'unsafes', 'timeouts', 'exec_errors',
                   'nofigures', 'savefig_errors', 'empty_svgs', 'successes']
예제 #19
0
from codemend import BackupHandler, relative_path
from codemend.models.element import ElementNormalizer
from codemend.models.word2vec_util import load_gensim_from_binary_file
from codemend.models.bimodal2 import BiModal
from codemend.experimental.code_suggest.mine_element import code_examples

if __name__ == '__main__':
  # Build a BiModal model from the mined element counts, a pretrained
  # word2vec model and the code_examples generator, then save it.
  counts_store = BackupHandler(
    relative_path('experimental/code_suggest/output/backup'))
  elem_counts = counts_store.load('elem_pyplot_counts_0404')
  all_elems = sorted(elem_counts.keys())
  enormer = ElementNormalizer()
  vectors_path = relative_path(
    'models/output/vectors-so-text-python-lemma-win5.bin')  # <-- note the change here!!
  w2v_model = load_gensim_from_binary_file(vectors_path)

  # Settings for this run; see the change log at the bottom for past variants.
  bimodal_options = dict(
    threads=None, alpha=0.05, window=5, negative=20,
    additive=0, multiply=0, concat=1,
    epoch=1, rand_parent_doc=True,
    hint_pvecs_init=True, hint_rvecs_init=False,
    neg_sample_used_elem=False)

  model = BiModal(all_elems, elem_counts, w2v_model, code_examples, enormer,
                  **bimodal_options)

  model.save(relative_path('models/output/bi2-0410-t.model'))

  # Changes:
  # bi2-test -- latest gold version for user study
  # bi2-0410-a -- epoch=10, fixed stopwords (e.g., excluding bar from stopwords) -- this is vanilla
  # bi2-0410-b -- epoch=1, quick check if setting is all right.
  # bi2-0410-c -- epoch=10, replicating bi2-0410-a
  # bi2-0410-d -- epoch=1, randomly with-parent doc
  # bi2-0410-e -- epoch=5, randomly with-parent doc
예제 #20
0
The table is like this:
  (func_id, code, svg)

There are at most 20 (shortest) examples per func_id.

"""
import sqlite3

from codemend import BackupHandler, relative_path

if __name__ == '__main__':

  print 'Reading SVGs and code examples. Takes 7.3 seconds...'
  bh = BackupHandler('.')
  svgs = bh.load('svgs')
  all_codes = bh.load('all_codes')
  plotcommands_examples = bh.load('plotcommands_examples')  # [plot_command] = [example_idx]

  db = sqlite3.connect(relative_path('demo/data/code.sqlite3'))
  cursor = db.cursor()

  cursor.executescript("""
    DROP TABLE IF EXISTS example;

    CREATE TABLE example (
      func_id TEXT NOT NULL,
      code TEXT NOT NULL,
      svg TEXT
    );