def get_text_content(pkg):
    """Build the list of weighted text snippets for a package record.

    ``pkg`` is read through ``pkg.get`` for the ``desc``, ``keywords`` and
    ``readme`` entries.  The readme is passed through ``extractText`` and
    then ``rk.run``, which is expected to yield ``(keyword, score)`` pairs.

    Returns a list of ``(text, weight)`` tuples, in this order:

    * each declared keyword longer than 2 characters, weight ``2.0``;
    * each extracted readme keyword with a score above 3, weight
      ``1.2 * math.log(score, 4)``;
    * the description, weight ``1.0``, when its length is strictly between
      2 and 512.
    """
    desc = pkg.get('desc', '')
    keywords = pkg.get('keywords', [])
    readme = pkg.get('readme', '') or ''

    # Anything that is not exactly a str / unicode object counts as
    # "no readme" and is replaced by the empty string.
    if not (type(readme) == str or type(readme) == unicode):
        readme = ''

    extracted = rk.run(extractText(readme))

    weighted = [(word, 2.0) for word in keywords if len(word) > 2]
    weighted.extend(
        (phrase, 1.2 * math.log(score, 4))
        for (phrase, score) in extracted
        if score > 3
    )
    if 2 < len(desc) < 512:
        weighted.append((desc, 1.0))
    return weighted
def get_text_content(pkg):
    """Collect weighted text snippets describing a package record.

    ``pkg`` is read through ``pkg.get`` for the ``desc``, ``keywords`` and
    ``readme`` entries; missing or ``None`` values are treated as empty.
    The readme is passed through ``extractText`` and then ``rk.run``, which
    is expected to yield ``(keyword, score)`` pairs.

    A readme that is not a string, or that starts with ``'ERROR'``, is
    treated as absent.  (The ``'ERROR'`` prefix is presumably a placeholder
    stored by the data source when no readme exists -- confirm against the
    code that populates ``pkg``.)

    Returns:
        A list of ``(text, weight)`` tuples, in this order:

        * each declared keyword longer than 2 characters, weight ``2.0``;
        * each extracted readme keyword with a score above 3, weight
          ``1.2 * math.log(score, 4)``;
        * the description, weight ``1.0``, when its length is strictly
          between 2 and 512.
    """
    # `or` fallbacks make an explicit None behave like a missing entry, the
    # same way the readme lookup already does.
    desc = pkg.get('desc', '') or ''
    keywords = pkg.get('keywords', []) or []
    readme = pkg.get('readme', '') or ''

    # Two separate isinstance calls keep `unicode` (Python 2 only) from being
    # looked up when readme is already a plain str.
    is_text = isinstance(readme, str) or isinstance(readme, unicode)
    # .startswith() is only reached once readme is known to be a string; a
    # non-string value (e.g. a dict) has no such method.
    if not is_text or readme.startswith('ERROR'):
        debug('No Readme Found')
        readme = ''

    readmeText = extractText(readme)
    debug('readme', readmeText)
    parsedKeywords = rk.run(readmeText)
    debug('rake', parsedKeywords)

    # Low-scoring phrases are dropped before weighting.
    scored = [item for item in parsedKeywords if item[1] > 3]

    results = [(kw, 2.0) for kw in keywords if len(kw) > 2]
    results.extend(
        (phrase, 1.2 * math.log(score, 4)) for (phrase, score) in scored
    )
    if 2 < len(desc) < 512:
        results.append((desc, 1.0))
    return results
def add_to_db(pkg, es):
    """Index one npm package record as an ``implementation`` document.

    Args:
        pkg: Package record.  ``pkg["name"]`` is required; ``desc`` and
            ``readme`` are read with ``pkg.get`` and default to ``""``.
        es: Client object whose ``index`` method accepts the ``index``,
            ``doc_type``, ``id`` and ``body`` keyword arguments
            (presumably an Elasticsearch client -- confirm with the caller).

    Returns:
        None.  The result of ``es.index`` is discarded.

    Raises:
        KeyError: when ``pkg`` is a dict without a ``"name"`` entry.  The
            lookup happens first, so nothing is indexed in that case.
    """
    name = pkg["name"]

    readme = pkg.get("readme", "")
    # Two separate isinstance calls keep `unicode` (Python 2 only) from being
    # looked up when readme is already a plain str.
    if not (isinstance(readme, str) or isinstance(readme, unicode)):
        # Non-string readme values are stored as an empty readme.
        readme = ""

    es.index(
        index="temp",
        doc_type="implementation",
        id=get_es_id(pkg),
        body={
            "language": "JavaScript",
            "algorithm": [],
            "source": "npm",
            "description": pkg.get("desc", ""),
            # Plain-text rendering of the readme, next to the raw content
            # stored under "instruction".
            "plaintext-readme": extractText(readme),
            "instruction": {
                "package": name,
                "command": "npm install " + name,
                "content": readme,
            },
            "popularity": compute_pkg_weight(pkg),
        },
    )
def add_to_db(pkg, es):
    """Index one npm package record as an ``implementation`` document.

    Args:
        pkg: Package record.  ``pkg['name']`` is required; ``desc`` and
            ``readme`` are read with ``pkg.get`` and default to ``''``.
        es: Client object whose ``index`` method accepts the ``index``,
            ``doc_type``, ``id`` and ``body`` keyword arguments
            (presumably an Elasticsearch client -- confirm with the caller).

    Returns:
        None.  The result of ``es.index`` is discarded.

    Raises:
        KeyError: when ``pkg`` is a dict without a ``'name'`` entry.  The
            lookup happens first, so nothing is indexed in that case.
    """
    name = pkg['name']

    readme = pkg.get('readme', '')
    # Two separate isinstance calls keep `unicode` (Python 2 only) from being
    # looked up when readme is already a plain str.
    if not (isinstance(readme, str) or isinstance(readme, unicode)):
        # Non-string readme values are stored as an empty readme.
        readme = ''

    es.index(
        index='temp',
        doc_type='implementation',
        id=get_es_id(pkg),
        body={
            'language': 'JavaScript',
            'algorithm': [],
            'source': 'npm',
            'description': pkg.get('desc', ''),
            # Plain-text rendering of the readme, next to the raw content
            # stored under 'instruction'.
            'plaintext-readme': extractText(readme),
            'instruction': {
                'package': name,
                'command': 'npm install ' + name,
                'content': readme,
            },
            'popularity': compute_pkg_weight(pkg),
        },
    )