def get_collocations_alchemy(self, timeout=60):
    """Extract ranked named entities from the uploaded PDF via AlchemyAPI.

    The PDF found in ``self.cleaned_data['article_pdf']`` is stored with
    ``handle_uploaded_file``, converted by the external tool configured in
    ``settings.PDFX_PATH`` (skipped when the ``<name>x.xml`` output already
    exists), and its full text is posted to the
    ``TextGetRankedNamedEntities`` endpoint.

    :param timeout: seconds to wait for the AlchemyAPI server before
        ``requests`` raises a timeout error. Pass ``None`` to wait
        indefinitely, which is what this method did before the parameter
        was introduced.
    :returns: list of ``('<entity text> (<entity type>)', relevance)``
        tuples, in the order the API returned the entities.
    :raises KeyError: if the decoded response has no ``'entities'`` key.
    """
    full_name = handle_uploaded_file(self.cleaned_data['article_pdf'])
    xml_path = full_name + "x.xml"
    # Only run the converter when its output is not already on disk.
    if not os.path.exists(xml_path):
        subprocess.call([settings.PDFX_PATH, full_name])
    extracted_data = parse_pdfx_xml(xml_path)
    full_text = nlp.get_full_text(extracted_data)['text']

    payload = {
        # SECURITY: a credential is embedded in source. Define
        # settings.ALCHEMY_API_KEY to override it; the literal is kept only
        # as a fallback so existing deployments keep working.
        'apikey': getattr(settings, 'ALCHEMY_API_KEY',
                          'd0604109bbeb676474b243bc623a0fc1a172437f'),
        'outputMode': 'json',
        'maxRetrieve': '100',
        'text': full_text,
    }
    # Without an explicit timeout, requests waits forever on a stalled
    # connection and would block the calling request handler.
    response = requests.post(
        'http://access.alchemyapi.com/calls/text/TextGetRankedNamedEntities',
        data=payload,
        timeout=timeout,
    )
    result = response.json()
    return [
        (entity['text'] + ' (' + entity['type'] + ')', entity['relevance'])
        for entity in result['entities']
    ]
def get_collocations(self):
    """Score the n-grams of the uploaded PDF with ``self.CLF``.

    The PDF in ``self.cleaned_data['article_pdf']`` is stored, converted by
    the tool at ``settings.PDFX_PATH`` unless its ``<name>x.xml`` output
    already exists, and turned into a temporary article. Each
    ``(ngram, feature)`` pair from ``PDFUploadForm.build_features`` is
    scored with ``self.CLF.predict_proba(feature)[0][1]``.

    The temporary article and every ``Collocations`` row with ``count=0``
    are deleted afterwards, even when feature building or scoring raises.

    :returns: list of ``(ngram, score)`` tuples sorted by score, highest
        first.
    """
    # Imported inside the method, as in the original (presumably to avoid a
    # circular import -- confirm before moving it to module level).
    from axel.stats.models import Collocations

    pdf_path = handle_uploaded_file(self.cleaned_data['article_pdf'])
    # NOTE: stemmer selection via cleaned_data['stem_func'] is disabled.
    pdfx_output = pdf_path + "x.xml"
    if not os.path.exists(pdfx_output):
        subprocess.call([settings.PDFX_PATH, pdf_path])
    text = nlp.get_full_text(parse_pdfx_xml(pdfx_output))['text']

    article = PDFUploadForm.generate_temp_article(text)
    try:
        scored = [
            (ngram, self.CLF.predict_proba(feature)[0][1])
            for ngram, feature in PDFUploadForm.build_features(article)
        ]
    finally:
        # Clean up the temporary article first, then the zero-count rows.
        article.delete()
        Collocations.objects.filter(count=0).delete()

    return sorted(scored, key=lambda pair: pair[1], reverse=True)