Exemple #1
0
 def __init__(self):
     """Load the saved pyplot element counts and create the normalizer.

     Reads the 'elem_pyplot_counts_0404' backup and stores both the counts
     mapping and the set of its keys on the instance.
     """
     backup_dir = relative_path('experimental/code_suggest/output/backup')
     counts_by_elem = BackupHandler(backup_dir).load('elem_pyplot_counts_0404')
     self.all_elem_counts = counts_by_elem
     self.all_elems = set(counts_by_elem.keys())
     self.enormer = ElementNormalizer()
Exemple #2
0
    def __init__(self):
        """
        Load everything the code suggester serves from memory.

        Sets the following attributes:
          - plot_commands: the result of get_plot_commands().
          - nonplot_commands: keys of get_pyplot_fu() not in plot_commands.
          - code_example_lookup: [func_id] = [{'code': ..., 'svg': ...}],
            read from demo/data/code.sqlite3 and ordered by ascending
            get_effective_code_len of the code.
          - elem_val_counts: [elem] = [(val, count)], highest count first.
          - func_position_finder: a FuncPositionFinder instance.

        """
        plot_commands = get_plot_commands()
        pyplot_fu = get_pyplot_fu()
        self.plot_commands = plot_commands
        # Build the set once so each membership test below is a hash lookup
        # instead of a scan over plot_commands.
        plot_command_set = set(plot_commands)
        self.nonplot_commands = [
            f for f in pyplot_fu.keys() if f not in plot_command_set
        ]
        print('CodeSuggest: extracted %d plot commands' % len(plot_commands))

        # Load all code examples of plotting commands from db into memory
        # These are generated by index_examples.py
        print('CodeSuggest: Loading code examples and pregenerated SVGs...')
        db = sqlite3.connect(relative_path('demo/data/code.sqlite3'))
        try:
            cursor = db.cursor()
            cursor.execute("SELECT func_id, code, svg FROM example")
            rows = cursor.fetchall()
        finally:
            # Close the connection even when the query raises.
            db.close()

        code_example_lookup = {}  # [func_id] = [{'code': code, 'svg': svg}]
        for func_id, code, svg in rows:
            code_example_lookup.setdefault(func_id, []).append(
                {'code': code, 'svg': svg})
        # Order each function's examples by effective code length, ascending
        for examples in code_example_lookup.values():
            examples.sort(key=lambda x: get_effective_code_len(x['code']))
        self.code_example_lookup = code_example_lookup
        print('CodeSuggest: Loaded %d code examples (with svgs)...' % len(rows))

        # Load element_index generated by experimental/code_suggest/mine_argvs.py
        # bh = BackupHandler(relative_path('demo/data'))
        # self.element_index = bh.load('element_index')
        # print 'Loaded element_index with %d keys'%len(self.element_index)

        # Load element value counts
        bh2 = BackupHandler(
            relative_path('experimental/code_suggest/output/backup'))
        self.elem_val_counts = bh2.load(
            'elem_pyplot_value_counts_0404')  # [elem][val] = count
        # Replace each {val: count} mapping by its (val, count) pairs,
        # highest count first. Only existing keys are reassigned, so
        # iterating the dict while doing this is safe.
        for elem_id in self.elem_val_counts:
            self.elem_val_counts[elem_id] = sorted(
                self.elem_val_counts[elem_id].items(), key=lambda x: -x[1])

        self.func_position_finder = FuncPositionFinder()
Exemple #3
0
  def __init__(self):
    """
    Load everything the code suggester serves from memory.

    Sets the following attributes:
      - plot_commands: the result of get_plot_commands().
      - nonplot_commands: keys of get_pyplot_fu() not in plot_commands.
      - code_example_lookup: [func_id] = [{'code': ..., 'svg': ...}], read
        from demo/data/code.sqlite3 and ordered by ascending
        get_effective_code_len of the code.
      - elem_val_counts: [elem] = [(val, count)], highest count first.
      - func_position_finder: a FuncPositionFinder instance.

    """
    plot_commands = get_plot_commands()
    pyplot_fu = get_pyplot_fu()
    self.plot_commands = plot_commands
    # Build the set once so each membership test below is a hash lookup
    # instead of a scan over plot_commands.
    plot_command_set = set(plot_commands)
    self.nonplot_commands = [
      f for f in pyplot_fu.keys() if f not in plot_command_set]
    print('CodeSuggest: extracted %d plot commands' % len(plot_commands))

    # Load all code examples of plotting commands from db into memory
    # These are generated by index_examples.py
    print('CodeSuggest: Loading code examples and pregenerated SVGs...')
    db = sqlite3.connect(relative_path('demo/data/code.sqlite3'))
    try:
      cursor = db.cursor()
      cursor.execute("SELECT func_id, code, svg FROM example")
      rows = cursor.fetchall()
    finally:
      # Close the connection even when the query raises.
      db.close()

    code_example_lookup = {}  # [func_id] = [{'code': code, 'svg': svg}]
    for func_id, code, svg in rows:
      code_example_lookup.setdefault(func_id, []).append(
        {'code': code, 'svg': svg})
    # Order each function's examples by effective code length, ascending
    for examples in code_example_lookup.values():
      examples.sort(key=lambda x: get_effective_code_len(x['code']))
    self.code_example_lookup = code_example_lookup
    print('CodeSuggest: Loaded %d code examples (with svgs)...' % len(rows))

    # Load element_index generated by experimental/code_suggest/mine_argvs.py
    # bh = BackupHandler(relative_path('demo/data'))
    # self.element_index = bh.load('element_index')
    # print 'Loaded element_index with %d keys'%len(self.element_index)

    # Load element value counts
    bh2 = BackupHandler(relative_path('experimental/code_suggest/output/backup'))
    self.elem_val_counts = bh2.load('elem_pyplot_value_counts_0404')  # [elem][val] = count
    # Replace each {val: count} mapping by its (val, count) pairs, highest
    # count first. Only existing keys are reassigned, so iterating the dict
    # while doing this is safe.
    for elem_id in self.elem_val_counts:
      self.elem_val_counts[elem_id] = sorted(
        self.elem_val_counts[elem_id].items(), key=lambda x: -x[1])

    self.func_position_finder = FuncPositionFinder()
Exemple #4
0
def code_examples():
  """
  Yield code examples, one source after another.

  The order is: SO examples, GitHub examples, Web examples. Each source is
  loaded from its backup only when the generator reaches it and the matching
  module-level cache (all_codes1 / all_codes2 / all_codes3) is still falsy.

  """
  global all_codes1, all_codes2, all_codes3

  # Source 1: 15770 code examples mined from SO answers in threads that are
  # tagged "matplotlib".
  if not all_codes1:
    print('Loading SO code examples...')
    so_dir = relative_path('experimental/code_suggest')
    all_codes1 = BackupHandler(so_dir).load('all_codes')
    print('%d examples from SO' % len(all_codes1))
  for so_code in all_codes1:
    yield so_code

  # Source 2: 8732 code examples (including 395 IPython Notebook files) mined
  # from GitHub repositories that contain "matplotlib".
  # (To skip this source, disable this section; the old warning read:
  # 'WARNING: mine_element.py ignoring all GitHub code examples...')
  if not all_codes2:
    print('Loading GitHub code examples...')
    github_dir = relative_path('experimental/code_suggest/output/backup')
    all_codes2 = BackupHandler(github_dir).load('all_codes_github_1k_repo_0322')
    print('%d examples from GitHub' % len(all_codes2))
  for github_code in all_codes2:
    yield github_code

  # Source 3: 21993 code examples extracted by Shiyan from the Web
  if not all_codes3:
    print('Loading Web code examples')
    web_dir = relative_path('experimental/mining/output')
    all_codes3 = BackupHandler(web_dir).load('codes_shiyan_0331_web')
    print('%d examples from Web Shiyan' % len(all_codes3))
  for web_code in all_codes3:
    yield web_code
Exemple #5
0
 def __init__(self):
   """Read the 'elem_pyplot_counts_0404' backup and build the normalizer.

   Keeps the loaded counts mapping as all_elem_counts and the set of its
   keys as all_elems.
   """
   counts_dir = relative_path('experimental/code_suggest/output/backup')
   loaded_counts = BackupHandler(counts_dir).load('elem_pyplot_counts_0404')
   self.all_elems = set(loaded_counts.keys())
   self.all_elem_counts = loaded_counts
   self.enormer = ElementNormalizer()
Exemple #6
0
from codemend.models.extract_so_code import load_threads, Thread, Answer
from codemend import BackupHandler, relative_path

if __name__ == '__main__':
  # Dump "<qid>\t<title>" for every matplotlib thread into a text file.
  bh = BackupHandler(relative_path('models/output/backup'))

  try:
    threads = bh.load('mpl_threads')
  except AssertionError:
    # Loading failed (presumably no 'mpl_threads' backup yet - verify against
    # BackupHandler): query the threads and save them for the next run.
    question_filter = "Tags LIKE '%<matplotlib>%' AND AnswerCount > 0 AND Score >= 0"
    answer_filter = "Score >= 0 ORDER BY Score DESC LIMIT 3"
    threads = list(load_threads(qfilter=question_filter, afilter=answer_filter))
    bh.save('mpl_threads', threads)

  titles_path = relative_path('models/output/mpl_so_titles.txt')
  with open(titles_path, 'w') as writer:
    for thread in threads:
      title_line = '%d\t%s\n' % (thread.qid, thread.qtitle.encode('utf-8'))
      writer.write(title_line)
Exemple #7
0
  # Step 1:
  # Copied from annotate_code_with_api.py
  with open('../../models/output/mpl_code_blocks.txt') as reader:
    content = reader.read()

  content = content.decode('utf-8')
  content = content.replace("&lt;", "<")
  content = content.replace("&gt;", ">")
  content = content.replace("&amp;", "&")

  sompl_blocks = content.split('\n\n\n')  # stackoverflow matplotlib code blocks
  print 'There are %d code examples from mpl stackoverflow'%len(sompl_blocks)

  # Step 2:
  bh = BackupHandler('.')
  cookbook_segs = bh.load('cookbook_segs')
  cookbook_blocks = []
  for tag, p in cookbook_segs:
    if tag == 'CODE':
      cookbook_blocks.append(p)

  print 'There are %d code examples from matplotlib cookbook'%len(cookbook_blocks)

  all_codes = sompl_blocks + cookbook_blocks

  print 'There are %d code blocks in total'%(len(all_codes))

  # Step 3:
  counters = {}
  counter_names = ['syntax_errors', 'unsafes', 'timeouts', 'exec_errors',
Exemple #8
0
            elem_counts[f, a, v] += 1
    return is_useful


if __name__ == '__main__':
    counters = defaultdict(int)

    md5s = set()

    all_codes = []

    fu, _ = get_fu_fau()

    elem_counts = defaultdict(int)  # [elem] = count

    bh = BackupHandler(
        relative_path('experimental/code_suggest/output/backup'))

    for root, dirs, files in os.walk(
            relative_path('mining/output/github-matplotlib-repos')):

        if '.git' in root: continue

        for file_name in files:
            counters['count_file'] += 1

            if counters['count_file'] % 1000 == 0:
                print 'Processed %d files - Useful files: %d' % (
                    counters['count_file'], counters['count_useful_files'])

            file_path = os.path.join(root, file_name)
from codemend.models.extract_so_code import load_threads, Thread, Answer
from codemend import BackupHandler, relative_path

if __name__ == '__main__':
    # Write one "<qid>\t<title>" line per matplotlib thread.
    backup_handler = BackupHandler(relative_path('models/output/backup'))

    try:
        threads = backup_handler.load('mpl_threads')
    except AssertionError:
        # Fall back to querying the threads, then save them under the same
        # backup name so the next run can load them.
        thread_iter = load_threads(
            qfilter="Tags LIKE '%<matplotlib>%' AND AnswerCount > 0 AND Score >= 0",
            afilter="Score >= 0 ORDER BY Score DESC LIMIT 3")
        threads = list(thread_iter)
        backup_handler.save('mpl_threads', threads)

    output_path = relative_path('models/output/mpl_so_titles.txt')
    with open(output_path, 'w') as writer:
        for thread in threads:
            encoded_title = thread.qtitle.encode('utf-8')
            writer.write('%d\t%s\n' % (thread.qid, encoded_title))
example also has its corresponding generated SVGs.

The table is like this:
  (func_id, code, svg)

There are at most 20 (shortest) examples per func_id.

"""
import sqlite3

from codemend import BackupHandler, relative_path

if __name__ == '__main__':

    print 'Reading SVGs and code examples. Takes 7.3 seconds...'
    bh = BackupHandler('.')
    svgs = bh.load('svgs')
    all_codes = bh.load('all_codes')
    plotcommands_examples = bh.load(
        'plotcommands_examples')  # [plot_command] = [example_idx]

    db = sqlite3.connect(relative_path('demo/data/code.sqlite3'))
    cursor = db.cursor()

    cursor.executescript("""
    DROP TABLE IF EXISTS example;

    CREATE TABLE example (
      func_id TEXT NOT NULL,
      code TEXT NOT NULL,
      svg TEXT
Exemple #11
0
  called.
- #3: Not recommending elements that occur too infrequently.
- #4: When a function is not used before, and its argv is recommended, we strip
  the "@", and recommend the function first, followed by the argv. e.g.
  [pie@0, pie] => [pie, pie@0].

"""

from codemend import BackupHandler, relative_path
from codemend.demo.code_suggest import get_plot_commands
from codemend.models.baseline2 import SuggestItem

# Plot commands as returned by get_plot_commands(), plus a set of the same
# values for membership tests.
plot_commands = get_plot_commands()
plot_commands_set = set(plot_commands)

# Counts loaded at import time from the 'elem_pyplot_counts_0404' backup.
bh = BackupHandler(relative_path('experimental/code_suggest/output/backup'))
elem_counts = bh.load('elem_pyplot_counts_0404')


def prune(used_elems, suggest_elems):
    for elem in used_elems:
        assert isinstance(elem, basestring)
    for elem in suggest_elems:
        assert isinstance(elem, SuggestItem), type(elem)

    used_elems_set = set(used_elems)
    used_funcs = map(get_func_name, used_elems)
    used_funcs_set = set(used_funcs)
    has_used_plot_commands = any(
        map(lambda x: x in plot_commands_set, used_funcs))
Exemple #12
0
 def __init__(self):
   """Load the 'pos_ave' backup from demo/data and report its size."""
   self.pos_ave = BackupHandler(relative_path('demo/data')).load('pos_ave')
   loaded = len(self.pos_ave)
   print('FuncPositionFinder: loaded %d average positions for functions' % loaded)
Exemple #13
0
  def __init__(self, w2v_model, all_elem_counts, maxngram=1,
               name=None, use_lemma=True,
               heuristic=False, use_coke=False):
    """
    Load the word2vec model and load or build the element index.

    Args:
      w2v_model: a binary vectors file path, or a loaded gensim Word2Vec
        instance.
      all_elem_counts: dict whose keys are the elements to index.
      maxngram, use_lemma, heuristic: stored on the instance as given.
      name: stored as self.name; when falsy, the model's filename is used
        if the model has one.
      use_coke: when true, the 'coke_0329' backup is loaded into self.coke.

    Raises:
      AssertionError: all_elem_counts is not a dict, or w2v_model is neither
        a string nor a Word2Vec instance.
      ValueError: use_coke is set and the coke backup does not exist.

    The element index (idfs, elems, elem_lookup, vecmat) is cached in the
    backup directory under '<model filename>_elem_index'. A model instance
    that carries no `filename` attribute has no cache name, so its index is
    built on every construction and not saved (previously this case raised
    AttributeError).

    """
    self.maxngram = maxngram
    self.name = name
    self.use_lemma = use_lemma
    assert isinstance(all_elem_counts, dict)
    self.all_elem_counts = all_elem_counts
    self.heuristic = heuristic
    self.use_coke = use_coke

    if isinstance(w2v_model, basestring):
      self.model = load_gensim_from_binary_file(w2v_model)
      self.model.filename = w2v_model.split('/')[-1]
    else:
      assert isinstance(w2v_model, Word2Vec)
      self.model = w2v_model

    # A model instance passed in directly may not carry a filename.
    model_filename = getattr(self.model, 'filename', None)
    if not self.name and model_filename is not None:
      self.name = model_filename

    self.model.init_sims()  # normalize the vectors

    self.enormer = ElementNormalizer()

    # One handler serves both the coke file and the element index cache.
    bh = BackupHandler(relative_path('models/output/backup'))

    if self.use_coke:
      coke_file = 'coke_0329'
      if not bh.exists(coke_file):
        raise ValueError('Coke file does not exist: %s'%coke_file)
      self.coke = bh.load(coke_file)

    print('Trying to load element indexes from cache ...')
    if model_filename is None:
      elem_index_backup_name = None  # nothing to key the cache on
    else:
      elem_index_backup_name = model_filename + '_elem_index'

    if elem_index_backup_name is not None and bh.exists(elem_index_backup_name):
      self.idfs, self.elems, self.elem_lookup, self.vecmat = bh.load(elem_index_backup_name)

    else:
      print('Word2vecBaseline building element indexes...')

      fu, fau = get_fu_fau()
      self.idfs = self.get_idf(fu.values() + fau.values())

      self.elems = sorted(self.all_elem_counts.keys())
      # elem_lookup maps each element to its row index in vecmat
      self.elem_lookup = dict((y,x) for (x,y) in enumerate(self.elems))
      vecs = []
      for e in self.elems:
        u = doc_serve.get_training_doc(e, True)
        v = self.get_bow_representation(u)
        vecs.append(v)
      self.vecmat = np.array(vecs)
      assert self.vecmat.shape == (len(self.elems), self.model.vector_size)

      if elem_index_backup_name is not None:
        bh.save(elem_index_backup_name, (self.idfs, self.elems, self.elem_lookup, self.vecmat))

      print('Finished building indexes.')
            train_pairs.append((merged_utter, astunparse.unparse(call_node)))

    unique_train_pairs = list(set(train_pairs))

    print 'total_block', total_block
    print 'total_grammatical', total_grammatical
    print 'total_call_nodes', total_call_nodes
    print 'total_matched_funcs', total_matched_funcs, '(total train pairs)'
    print 'total_matched_args', total_matched_args
    print 'total_unique_train_pairs', len(unique_train_pairs)

    return unique_train_pairs


if __name__ == '__main__':
    bh = BackupHandler(relative_path('models/output/backup'))

    # Step 1
    fu, fau = get_fu_fau()

    # Step 2
    with open(relative_path('models/output/mpl_code_blocks.txt')) as reader:
        content = reader.read()

    content = content.decode('utf-8')
    content = content.replace("&lt;", "<")
    content = content.replace("&gt;", ">")
    content = content.replace("&amp;", "&")

    blocks = content.split('\n\n\n')
Exemple #15
0

def get_effective_code_len(code):
  """
  Length in characters of a code example, ignoring import lines.

  A line is ignored when one of its whitespace-separated tokens is exactly
  "import". The remaining lines are joined with newlines before counting.

  """
  kept = [line for line in code.split('\n') if 'import' not in line.split()]
  return len('\n'.join(kept))


if __name__ == '__main__':

  print 'Reading SVGs and code examples. Takes 7.3 seconds...'
  bh = BackupHandler('.')
  svgs = bh.load('svgs')
  all_codes = bh.load('all_codes')




  print 'Loading functions that are plotting commands'
  # Copied from code_suggest.py
  import csv
  import pattern.en
  # Load csv file of pyplot summary
  pyplot_fu = {}  # [func] = utter
  print 'CodeSuggest: Loading pyplot fu...'
  with open('../../docstring_parse/pyplot_fu.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    def __init__(self,
                 w2v_model,
                 all_elem_counts,
                 maxngram=1,
                 name=None,
                 use_lemma=True,
                 heuristic=False,
                 use_coke=False):
        """
        Load the word2vec model and load or build the element index.

        Args:
          w2v_model: a binary vectors file path, or a loaded gensim Word2Vec
            instance.
          all_elem_counts: dict whose keys are the elements to index.
          maxngram, use_lemma, heuristic: stored on the instance as given.
          name: stored as self.name; when falsy, the model's filename is
            used if the model has one.
          use_coke: when true, the 'coke_0329' backup is loaded into
            self.coke.

        Raises:
          AssertionError: all_elem_counts is not a dict, or w2v_model is
            neither a string nor a Word2Vec instance.
          ValueError: use_coke is set and the coke backup does not exist.

        The element index (idfs, elems, elem_lookup, vecmat) is cached in
        the backup directory under '<model filename>_elem_index'. A model
        instance that carries no `filename` attribute has no cache name, so
        its index is built on every construction and not saved (previously
        this case raised AttributeError).

        """
        self.maxngram = maxngram
        self.name = name
        self.use_lemma = use_lemma
        assert isinstance(all_elem_counts, dict)
        self.all_elem_counts = all_elem_counts
        self.heuristic = heuristic
        self.use_coke = use_coke

        if isinstance(w2v_model, basestring):
            self.model = load_gensim_from_binary_file(w2v_model)
            self.model.filename = w2v_model.split('/')[-1]
        else:
            assert isinstance(w2v_model, Word2Vec)
            self.model = w2v_model

        # A model instance passed in directly may not carry a filename.
        model_filename = getattr(self.model, 'filename', None)
        if not self.name and model_filename is not None:
            self.name = model_filename

        self.model.init_sims()  # normalize the vectors

        self.enormer = ElementNormalizer()

        # One handler serves both the coke file and the element index cache.
        bh = BackupHandler(relative_path('models/output/backup'))

        if self.use_coke:
            coke_file = 'coke_0329'
            if not bh.exists(coke_file):
                raise ValueError('Coke file does not exist: %s' % coke_file)
            self.coke = bh.load(coke_file)

        print('Trying to load element indexes from cache ...')
        if model_filename is None:
            elem_index_backup_name = None  # nothing to key the cache on
        else:
            elem_index_backup_name = model_filename + '_elem_index'

        if (elem_index_backup_name is not None
                and bh.exists(elem_index_backup_name)):
            self.idfs, self.elems, self.elem_lookup, self.vecmat = bh.load(
                elem_index_backup_name)

        else:
            print('Word2vecBaseline building element indexes...')

            fu, fau = get_fu_fau()
            self.idfs = self.get_idf(fu.values() + fau.values())

            self.elems = sorted(self.all_elem_counts.keys())
            # elem_lookup maps each element to its row index in vecmat
            self.elem_lookup = dict((y, x) for (x, y) in enumerate(self.elems))
            vecs = []
            for e in self.elems:
                u = doc_serve.get_training_doc(e, True)
                v = self.get_bow_representation(u)
                vecs.append(v)
            self.vecmat = np.array(vecs)
            assert self.vecmat.shape == (len(self.elems),
                                         self.model.vector_size)

            if elem_index_backup_name is not None:
                bh.save(elem_index_backup_name,
                        (self.idfs, self.elems, self.elem_lookup, self.vecmat))

            print('Finished building indexes.')
Exemple #17
0
    # Copied from annotate_code_with_api.py
    with open('../../models/output/mpl_code_blocks.txt') as reader:
        content = reader.read()

    content = content.decode('utf-8')
    content = content.replace("&lt;", "<")
    content = content.replace("&gt;", ">")
    content = content.replace("&amp;", "&")

    sompl_blocks = content.split(
        '\n\n\n')  # stackoverflow matplotlib code blocks
    print 'There are %d code examples from mpl stackoverflow' % len(
        sompl_blocks)

    # Step 2:
    bh = BackupHandler('.')
    cookbook_segs = bh.load('cookbook_segs')
    cookbook_blocks = []
    for tag, p in cookbook_segs:
        if tag == 'CODE':
            cookbook_blocks.append(p)

    print 'There are %d code examples from matplotlib cookbook' % len(
        cookbook_blocks)

    all_codes = sompl_blocks + cookbook_blocks

    print 'There are %d code blocks in total' % (len(all_codes))

    # Step 3:
    counters = {}
Exemple #18
0
      elif isinstance(v, ast.Name):
        kvs.append((k, v.id))
      elif isinstance(v, ast.Tuple) \
           or isinstance(v, ast.Dict) \
           or isinstance(v, ast.List):
        kvs.append((k, astunparse.unparse(v).strip()))
  return func_name, kvs


if __name__ == '__main__':

  # Step 1.
  fu, fau = get_fu_fau()

  # Step 2.
  bh = BackupHandler(relative_path('experimental/code_suggest'))
  all_codes = bh.load('all_codes')
  print 'There are %d code examples in total'%len(all_codes)

  # Step 3.
  f_counts = defaultdict(int)  # [f] = count
  fa_counts = defaultdict(int)  # [f,a] = count
  fav_counts = defaultdict(int)  # [f,a,v] = count
  for code in all_codes:
    try:
      node = ast.parse(code)
    except SyntaxError:
      continue
    calls = findCallNodes(node)

    for call in calls:
Exemple #19
0
def transform_and_filter(elem):
  """
  Simplify an element and keep it only if it belongs to plt.

  The element is passed through enormer.simplify; per the original notes
  that step is meant to reduce sparsity (pylab.xxx --> plt.xxx, various
  add_subplot.xxx --> plt.gca.xxx, see stype.tsv) - confirm against
  ElementNormalizer.

  Returns: the simplified elem if it starts with 'plt.', otherwise None

  """
  simplified = enormer.simplify(elem)
  return simplified if simplified.startswith('plt.') else None

if __name__ == '__main__':
  # Count every (x, y) pair yielded by get_cokes over all code examples, then
  # save the counts as the 'coke_0329' backup.
  coke_counts = defaultdict(int)
  for count, code in enumerate(code_examples(), 1):
    # Progress report every 1000 examples
    if count % 1000 == 0:
      print('%d ... unique_cokes=%d' % (count, len(coke_counts)))

    for x, y in get_cokes(code):
      coke_counts[x, y] += 1

  backup_dir = relative_path('models/output/backup')
  BackupHandler(backup_dir).save('coke_0329', coke_counts)
Exemple #20
0
 def __init__(self):
     """Read the 'pos_ave' backup under demo/data and print how many entries it has."""
     data_dir = relative_path('demo/data')
     self.pos_ave = BackupHandler(data_dir).load('pos_ave')
     message = 'FuncPositionFinder: loaded %d average positions for functions'
     print(message % len(self.pos_ave))
      train_pairs.append((merged_utter, astunparse.unparse(call_node)))

  unique_train_pairs = list(set(train_pairs))

  print 'total_block', total_block
  print 'total_grammatical', total_grammatical
  print 'total_call_nodes', total_call_nodes
  print 'total_matched_funcs', total_matched_funcs, '(total train pairs)'
  print 'total_matched_args', total_matched_args
  print 'total_unique_train_pairs', len(unique_train_pairs)

  return unique_train_pairs


if __name__ == '__main__':
  bh = BackupHandler(relative_path('models/output/backup'))

  # Step 1
  fu, fau = get_fu_fau()

  # Step 2
  with open(relative_path('models/output/mpl_code_blocks.txt')) as reader:
    content = reader.read()

  content = content.decode('utf-8')
  content = content.replace("&lt;", "<")
  content = content.replace("&gt;", ">")
  content = content.replace("&amp;", "&")

  blocks = content.split('\n\n\n')
Exemple #22
0
      elem_counts[f,a] += 1
      elem_counts[f,a,v] += 1
  return is_useful

if __name__ == '__main__':
  counters = defaultdict(int)

  md5s = set()

  all_codes = []

  fu, _ = get_fu_fau()

  elem_counts = defaultdict(int)  # [elem] = count

  bh = BackupHandler(relative_path('experimental/code_suggest/output/backup'))

  for root, dirs, files in os.walk(
      relative_path('mining/output/github-matplotlib-repos')):

    if '.git' in root: continue

    for file_name in files:
      counters['count_file'] += 1

      if counters['count_file'] % 1000 == 0:
        print 'Processed %d files - Useful files: %d'%(
          counters['count_file'], counters['count_useful_files'])

      file_path = os.path.join(root, file_name)
Exemple #23
0
            elif isinstance(v, ast.Name):
                kvs.append((k, v.id))
            elif isinstance(v, ast.Tuple) \
                 or isinstance(v, ast.Dict) \
                 or isinstance(v, ast.List):
                kvs.append((k, astunparse.unparse(v).strip()))
    return func_name, kvs


if __name__ == '__main__':

    # Step 1.
    fu, fau = get_fu_fau()

    # Step 2.
    bh = BackupHandler(relative_path('experimental/code_suggest'))
    all_codes = bh.load('all_codes')
    print 'There are %d code examples in total' % len(all_codes)

    # Step 3.
    f_counts = defaultdict(int)  # [f] = count
    fa_counts = defaultdict(int)  # [f,a] = count
    fav_counts = defaultdict(int)  # [f,a,v] = count
    for code in all_codes:
        try:
            node = ast.parse(code)
        except SyntaxError:
            continue
        calls = findCallNodes(node)

        for call in calls:
Exemple #24
0
   this code
3. take average per function

Output:
 - a dictionary: [function] = average_position
   average position: between 0 (beginning of code) and 1 (end of code).
"""

import ast
from collections import defaultdict

from codemend import BackupHandler, relative_path
from codemend.models.annotate_code_with_api import get_fu_fau, findCallNodes, extractCallComponents

fu, fau = get_fu_fau()
bh = BackupHandler(relative_path('experimental/code_suggest'))
all_codes = bh.load('all_codes')
print 'There are %d code examples in total'%len(all_codes)

pos_sum = defaultdict(float)  # [f] = sum
pos_cnt = defaultdict(int)  # [f] = count
for code in all_codes:
  try:
    node = ast.parse(code)
  except SyntaxError:
    continue
  calls = findCallNodes(node)
  called_funcs = [extractCallComponents(x)[0] for x in calls]
  called_funcs = filter(lambda x: x in fu, called_funcs)
  if len(calls) < 3:
    continue
Exemple #25
0
      if elem_id.startswith('plt.'):
        element_pyplot_counts[elem_id] += 1
        val = get_countable_value(e.val_node, varmap, enormer)
        if val: element_pyplot_value_counts[elem_id][val] += 1

  for elem_id in element_pyplot_value_counts:
    element_pyplot_value_counts[elem_id] = dict(element_pyplot_value_counts[elem_id])
  element_pyplot_value_counts = dict(element_pyplot_value_counts)

  print 'Processed %d code examples'%count
  print 'There are %d unique elements'%len(element_counts)
  print 'There are %d unique pyplot elements'%len(element_pyplot_counts)
  for k in counters:
    print '%s: %d'%(k, counters[k])

  bh = BackupHandler(relative_path('experimental/code_suggest/output/backup'))
  # Change logs:
  # - 0322: using raw format
  # - 0327: using Element, tracking return type and variable assignments and
  #   import aliases.
  # - 0404: fixed issue with dict as positional argument;
  #         added element_value_counts;
  #         added Shiyan's example.
  bh.save('elem_counts_0404', element_counts)
  bh.save('elem_pyplot_counts_0404', element_pyplot_counts)
  bh.save('elem_pyplot_value_counts_0404', element_pyplot_value_counts)

  """
  Log:

  # 0327
Exemple #26
0

def get_effective_code_len(code):
    """
    Count the characters of a code example, skipping import lines.

    Lines that contain the standalone token "import" are dropped; the rest
    are joined back with newlines and the length of that text is returned.

    """
    counted_lines = []
    for line in code.split('\n'):
        if 'import' in line.split():
            continue
        counted_lines.append(line)
    return len('\n'.join(counted_lines))


if __name__ == '__main__':

    print 'Reading SVGs and code examples. Takes 7.3 seconds...'
    bh = BackupHandler('.')
    svgs = bh.load('svgs')
    all_codes = bh.load('all_codes')

    print 'Loading functions that are plotting commands'
    # Copied from code_suggest.py
    import csv
    import pattern.en
    # Load csv file of pyplot summary
    pyplot_fu = {}  # [func] = utter
    print 'CodeSuggest: Loading pyplot fu...'
    with open('../../docstring_parse/pyplot_fu.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header
        for f, u in reader:
            if not u:
Exemple #27
0
def transform_and_filter(elem):
    """
    Run elem through enormer.simplify and filter out non-plt results.

    The original notes describe the simplification as a cleaning step to
    reduce sparsity (pylab.xxx --> plt.xxx, various add_subplot.xxx -->
    plt.gca.xxx, see stype.tsv); that behavior lives in ElementNormalizer
    and should be confirmed there.

    Returns: the cleaned elem when it starts with 'plt.', else None

    """
    cleaned = enormer.simplify(elem)
    if not cleaned.startswith('plt.'):
        return None
    return cleaned


if __name__ == '__main__':
    # Tally the pairs produced by get_cokes for each code example and save
    # the tally under the 'coke_0329' backup name.
    coke_counts = defaultdict(int)
    count = 0
    for code in code_examples():
        count += 1
        report_progress = (count % 1000 == 0)
        if report_progress:
            print('%d ... unique_cokes=%d' % (count, len(coke_counts)))

        for x, y in get_cokes(code):
            pair = (x, y)
            coke_counts[pair] += 1

    output_handler = BackupHandler(relative_path('models/output/backup'))
    output_handler.save('coke_0329', coke_counts)
Exemple #28
0
from codemend import BackupHandler, relative_path
from codemend.models.element import ElementNormalizer
from codemend.models.word2vec_util import load_gensim_from_binary_file
from codemend.models.bimodal2 import BiModal
from codemend.experimental.code_suggest.mine_element import code_examples

if __name__ == '__main__':
  # Train a BiModal model on the saved pyplot element counts and save it.
  backup_dir = relative_path('experimental/code_suggest/output/backup')
  elem_counts = BackupHandler(backup_dir).load('elem_pyplot_counts_0404')
  all_elems = sorted(elem_counts.keys())
  enormer = ElementNormalizer()
  vectors_path = relative_path('models/output/vectors-so-text-python-lemma-win5.bin')
  w2v_model = load_gensim_from_binary_file(vectors_path)  # <-- note the change here!!

  # Keyword settings for this run, passed to BiModal unchanged
  bimodal_settings = dict(
    threads=None, alpha=0.05, window=5, negative=20,
    additive=0, multiply=0, concat=1,
    epoch=1, rand_parent_doc=True,
    hint_pvecs_init=True, hint_rvecs_init=False,
    neg_sample_used_elem=False)
  model = BiModal(all_elems, elem_counts, w2v_model, code_examples, enormer,
                  **bimodal_settings)

  model.save(relative_path('models/output/bi2-0410-t.model'))

  # Changes:
  # bi2-test -- latest gold version for user study
  # bi2-0410-a -- epoch=10, fixed stopwords (e.g., excluding bar from stopwords) -- this is vanilla
  # bi2-0410-b -- epoch=1, quick check if setting is all right.
  # bi2-0410-c -- epoch=10, replicating bi2-0410-a
  # bi2-0410-d -- epoch=1, randomly with-parent doc
  # bi2-0410-e -- epoch=5, randomly with-parent doc
Exemple #29
0
example also has its corresponding generated SVGs.

The table is like this:
  (func_id, code, svg)

There are at most 20 (shortest) examples per func_id.

"""
import sqlite3

from codemend import BackupHandler, relative_path

if __name__ == '__main__':

  print 'Reading SVGs and code examples. Takes 7.3 seconds...'
  bh = BackupHandler('.')
  svgs = bh.load('svgs')
  all_codes = bh.load('all_codes')
  plotcommands_examples = bh.load('plotcommands_examples')  # [plot_command] = [example_idx]

  db = sqlite3.connect(relative_path('demo/data/code.sqlite3'))
  cursor = db.cursor()

  cursor.executescript("""
    DROP TABLE IF EXISTS example;

    CREATE TABLE example (
      func_id TEXT NOT NULL,
      code TEXT NOT NULL,
      svg TEXT
    );