def index_code_snippet(writer):
    """Index Java source files from a hard-coded archive directory.

    For each file a ``Document`` is created that stores the file path and
    the compressed file content as stored, non-indexed fields. The content
    is then parsed, and the document is handed to ``writer.addDocument``
    only when ``add_code_keyword_into_document`` returns a truthy value.
    Any exception raised while handling a single file is printed and that
    file is skipped.

    Args:
        writer: Object providing ``addDocument(document)`` (presumably a
            Lucene ``IndexWriter`` -- confirm against the caller).
    """
    HOME = "/Users/Raphael/Downloads/GitArchive"  #29.06.2015, 03.07.2015, 15.07.2015
    jfiles = java_files_from_dir(HOME)

    # print("Number of Java files to process: %s" % (len(jfiles)))

    i = 0  # Files visited; pre-set so the summary works for an empty list.
    j = 0  # Documents actually written through the writer.

    for i, jfile in enumerate(jfiles, 1):
        if i % 1000 == 0:
            print("Counter: %s" % i)
            # NOTE(review): this ends the whole run at the 1000th file, so
            # at most 999 files are ever processed -- confirm the cap is
            # intended and not a leftover debugging limit.
            break

        document = Document()
        document.add(Field("file", jfile, Field.Store.YES, Field.Index.NO))

        try:
            with codecs.open(jfile, "r", encoding='utf-8') as f:
                # Decode as UTF-8, then hand UTF-8 encoded bytes downstream.
                file_content = f.read().encode("utf-8")

            document.add(
                Field("file_content", compress(file_content), Field.Store.YES,
                      Field.Index.NO))

            # NOTE: duplicate detection (hashing file_content) and batched
            # parallel parsing are not performed here; every file is
            # parsed sequentially.
            ast = parse(file_content, resolve=False)
            if add_code_keyword_into_document(document, file_content, ast):
                writer.addDocument(document)
                j += 1
                if j % 1000 == 0:
                    print("Wrote:: %s files" % j)

        except Exception as e:
            # Best effort: report the failure and move on to the next file.
            #traceback.print_exc()
            #print jfile
            print("Error: %s" % e)

    print("Number of files: %s" % i)

    # NOTE(review): `hashes` is not defined inside this function and nothing
    # here adds to it; presumably it is a module-level set -- verify,
    # otherwise this line raises NameError after all indexing work is done.
    print("Number of duplicates: %s" % len(hashes))

    print("%s files has been indexed" % j)
Beispiel #2
0
 def __init__(self, source, index_path):
     """Parse ``source`` and prepare the query parser and index.

     Args:
         source: Source text passed to ``parse`` with ``resolve=True`` and
             ``source=True``.
         index_path: Stored on the instance as ``self.index_path``
             (presumably read by ``load_index`` -- confirm).
     """
     self.index_path = index_path
     self.source = source

     # ``parse`` returns an (ast, source) pair here; the returned source
     # replaces the raw input kept on the instance.
     self.ast, self.source = parse(self.source, resolve=True, source=True)

     default_field = "typed_method_call"
     self.queryparser = QueryParser(
         Version.LUCENE_CURRENT, default_field, KeywordAnalyzer())
     self.load_index()
Beispiel #3
0
    def more_like_this(self):

        trees = []

        file_hash_process = set()
        query = self.document_to_query()

        if query:
            print "-" * 30
            print "Query: %s" % query
            print "-" * 30
            try:
                like_query = self.queryparser.parse(query)

                hits = self.searcher.search(like_query, 10).scoreDocs

                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    matched_terms = self.get_matched_keywords(
                        like_query, hit.doc)
                    file_path = doc.getField("file").stringValue()
                    #print "Matched Terms", matched_terms
                    print "Path: ", file_path
                    # apis = [d.stringValue() for d in doc.getFields("typed_method_call")]
                    with open(file_path, "r") as f:
                        file_content = f.read()

                    file_hash = doc.getField("hash").stringValue()

                    #print "FILE", file_content
                    #print "PARSE", parse(file_content, resolve=False)
                    if file_hash not in file_hash_process:
                        trees.append(parse(file_content, resolve=False))
                        file_hash_process.add(file_hash)
                    else:
                        print "Duplicate: ", file_path

                    #trees.append( self.get_AST_from_Doc(doc) )

            except Exception as e:
                print "Error: %s" % e

        return trees
Beispiel #4
0
def transform_body(body):
	"""Extract parsed code snippets and hint tokens from an HTML body.

	The body is split on ``</code>``; for every piece containing ``<code>``
	the text after the first ``<code>`` is HTML-unescaped. Text containing
	both ``.`` and ``(`` is treated as a code snippet. Text shorter than 25
	characters is also kept as a hint, except snippet text whose piece
	contains ``<pre``.

	Returns:
		tuple: ``(asts, code_hints)`` where ``asts`` holds the truthy
		results of ``parse(snippet, resolve=True)`` and ``code_hints`` is
		the set of tokens produced by ``utils.tokenize`` over all hints.
	"""
	open_tag = "<code>"
	snippets = []
	hints = []

	for segment in body.split("</code>"):
		if open_tag not in segment:
			continue

		# Everything after the first opening tag is the code text.
		code_tag = utils.unescape_html(segment.partition(open_tag)[2])
		looks_like_call = "." in code_tag and "(" in code_tag
		is_short = len(code_tag) < 25

		if looks_like_call:
			snippets.append(code_tag)
			# Heuristic: a short tag outside a <pre> block is inline code.
			if is_short and "<pre" not in segment:
				hints.append(code_tag)
		elif is_short:
			hints.append(code_tag)

	hint_tokens = set()
	for hint in hints:
		hint_tokens.update(utils.tokenize(hint))

	# Keep only snippets for which the parser returned something truthy.
	parsed = (parse(snippet, resolve=True) for snippet in snippets)
	asts = [tree for tree in parsed if tree]

	return asts, hint_tokens
Beispiel #5
0
	def __init__(self, snippet, sources):
		self.snippet = parse(snippet, resolve=True)
		self.sources = sources

		self.class_PQN_to_FQN = defaultdict(list)
		self.method_PQN_to_FQN = defaultdict(lambda: { "fqn": [], "class": []})