# Note: this module relies on imports defined elsewhere (codecs,
# zlib.compress, the Lucene Document/Field classes, and the helpers
# java_files_from_dir, parse, add_code_keyword_into_document).
def index_code_snippet(writer):
    HOME = "/Users/Raphael/Downloads/GitArchive"  # 29.06.2015, 03.07.2015, 15.07.2015
    jfiles = java_files_from_dir(HOME)
    N_cores = 4
    # print("Number of Java files to process: %s" % len(jfiles))

    source_queue = []
    hashes = set()  # filled by the (currently disabled) duplicate check below
    i = 0
    j = 0
    for jfile in jfiles:
        i += 1
        if i % 1000 == 0:
            print("Counter: %s" % i)
            break  # debug limit: stop after the first 1000 files

        document = Document()
        document.add(Field("file", jfile, Field.Store.YES, Field.Index.NO))
        try:
            with codecs.open(jfile, "r", encoding="utf-8") as f:
                file_content = f.read().encode("utf-8")

            document.add(Field("file_content", compress(file_content),
                               Field.Store.YES, Field.Index.NO))

            # Check for duplicate files and accumulate source code
            # hash_v = str(md5(file_content))
            # if hash_v not in hashes:
            #     source_queue.append((document, file_content))
            #     hashes.add(hash_v)

            # Wait until enough source files are queued, then parse in parallel
            # if len(source_queue) >= N_cores:
            #     ast_docs = parallize(source_queue)
            #     source_queue = []
            #     for ast, file_content, doc in ast_docs:

            ast = parse(file_content, resolve=False)
            if add_code_keyword_into_document(document, file_content, ast):
                writer.addDocument(document)
                j += 1
                if j % 1000 == 0:
                    print("Wrote:: %s files" % j)
        except Exception as e:
            # traceback.print_exc()
            # print(jfile)
            print("Error: %s" % e)
            continue

    print("Number of files: %s" % i)
    print("Number of duplicates: %s" % len(hashes))
    print("%s files have been indexed" % j)
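# A minimal driver sketch for index_code_snippet, assuming PyLucene 4.x
# (consistent with the Version.LUCENE_CURRENT usage in this codebase).
# The index location is a placeholder, not the project's actual path.
import lucene
from java.io import File
from org.apache.lucene.analysis.core import KeywordAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

if __name__ == "__main__":
    lucene.initVM()
    store = SimpleFSDirectory(File("/tmp/github_index"))  # placeholder path
    config = IndexWriterConfig(Version.LUCENE_CURRENT, KeywordAnalyzer())
    writer = IndexWriter(store, config)
    try:
        index_code_snippet(writer)
    finally:
        writer.close()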
def __init__(self, source, index_path):
    self.index_path = index_path
    self.source = source
    # Parse with type resolution; parse() also returns the (possibly
    # normalized) source text, which replaces the raw input.
    ast, source = parse(self.source, resolve=True, source=True)
    self.source = source
    self.ast = ast
    self.queryparser = QueryParser(Version.LUCENE_CURRENT,
                                   "typed_method_call", KeywordAnalyzer())
    self.load_index()
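# load_index is defined elsewhere in the class. A plausible sketch under
# the same PyLucene 4.x assumption, opening a read-only searcher over
# self.index_path (assumed implementation, not the project's code):
from java.io import File
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import SimpleFSDirectory

def load_index(self):
    directory = SimpleFSDirectory(File(self.index_path))
    self.reader = DirectoryReader.open(directory)
    self.searcher = IndexSearcher(self.reader)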
def more_like_this(self):
    trees = []
    file_hash_process = set()
    query = self.document_to_query()
    if query:
        print("-" * 30)
        print("Query: %s" % query)
        print("-" * 30)
        try:
            like_query = self.queryparser.parse(query)
            hits = self.searcher.search(like_query, 10).scoreDocs
            for i, hit in enumerate(hits):
                doc = self.searcher.doc(hit.doc)
                matched_terms = self.get_matched_keywords(like_query, hit.doc)
                file_path = doc.getField("file").stringValue()
                # print("Matched Terms", matched_terms)
                print("Path: %s" % file_path)

                # apis = [d.stringValue() for d in doc.getFields("typed_method_call")]

                with open(file_path, "r") as f:
                    file_content = f.read()
                file_hash = doc.getField("hash").stringValue()
                # print("FILE", file_content)
                # print("PARSE", parse(file_content, resolve=False))

                # Parse each distinct result file only once, keyed by its hash
                if file_hash not in file_hash_process:
                    trees.append(parse(file_content, resolve=False))
                    file_hash_process.add(file_hash)
                else:
                    print("Duplicate: %s" % file_path)
                # trees.append(self.get_AST_from_Doc(doc))
        except Exception as e:
            print("Error: %s" % e)
    return trees
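# Hypothetical usage of the class these methods belong to; the class name
# SnippetSearcher and the Java snippet below are illustrative only.
snippet = """
import java.util.ArrayList;
public class Demo {
    public static void main(String[] args) {
        ArrayList<String> names = new ArrayList<String>();
        names.add("hello");
    }
}
"""
searcher = SnippetSearcher(snippet, "/tmp/github_index")
trees = searcher.more_like_this()
print("Retrieved %s candidate ASTs" % len(trees))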
def transform_body(body):
    code_snippets = []
    code_hints = []
    for item in body.split("</code>"):
        if "<code>" in item:
            code_tag = item[item.find("<code>") + len("<code>"):]
            code_tag = utils.unescape_html(code_tag)
            if "." in code_tag and "(" in code_tag:
                code_snippets.append(code_tag)
                if "<pre" not in item and len(code_tag) < 25:
                    # Heuristic: a short fragment outside a <pre> block is an
                    # inline code mention, so keep it as a hint as well
                    code_hints.append(code_tag)
            elif len(code_tag) < 25:
                code_hints.append(code_tag)

    l = []
    for code_hint in code_hints:
        l.extend(utils.tokenize(code_hint))
    code_hints = set(l)

    # parsers = [JDTParser(code_snippet, parse) for code_snippet in code_snippets]
    # futures = pool.invokeAll(parsers)
    # asts = [future.get(3, TimeUnit.SECONDS).result for future in futures]

    # asts = [parse(code_snippet, resolve=False) for code_snippet in code_snippets]
    asts = []
    for code_snippet in code_snippets:
        ast = parse(code_snippet, resolve=True)
        if ast:
            asts.append(ast)
    return asts, code_hints
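# Illustrative call; the HTML mimics a Stack Overflow answer body, and
# utils.unescape_html / utils.tokenize are assumed to come from this repo.
body = (
    "<p>Use a reader:</p>"
    "<pre><code>BufferedReader br = new BufferedReader(new FileReader(f));\n"
    "String line = br.readLine();</code></pre>"
    "<p>See <code>BufferedReader</code> for details.</p>"
)
asts, code_hints = transform_body(body)
# code_hints now holds tokens such as "BufferedReader"; asts holds the
# parse results of the snippet-like <code> blocks.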
def __init__(self, snippet, sources):
    self.snippet = parse(snippet, resolve=True)
    self.sources = sources
    # Map partially qualified names (PQN) to candidate fully qualified
    # names (FQN) for classes and methods (requires collections.defaultdict)
    self.class_PQN_to_FQN = defaultdict(list)
    self.method_PQN_to_FQN = defaultdict(lambda: {"fqn": [], "class": []})
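# For orientation, the intended shape of these maps; the entries below are
# illustrative examples, not values produced by this constructor:
# class_PQN_to_FQN["List"]  -> ["java.util.List", "java.awt.List"]
# method_PQN_to_FQN["add"]  -> {"fqn":   ["java.util.List.add"],
#                               "class": ["java.util.List"]}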