Example #1
    def import_from_url(self, author_id, url):
        """import_from_url is used to import memos from external sources in internet.

        Args:
            author_id: memo's author's id as ObjectId.
            url: source location where imported content is currently as string.

        Returns:
            Union(Memo, None): returns Memo if saved successfully, returns None if not or
                               content from website were empty.
        """
        try:
            imported = fetch_url(url)
            content = extract(imported,
                              include_comments=False,
                              include_tables=True,
                              output_format="txt")
            if not content:
                raise ValueError

            # Double every newline so paragraphs end up separated by blank lines.
            index = content.find('\n')
            while index != -1:
                content = content[:index] + "\n" + content[index:]
                index = content.find('\n', index + 2)
            # Build a title from the host part of the URL (between "://" and the
            # next "/"); very long hosts fall back to a truncated slice of the URL.
            url_i = (url.find("://"), url.find("/", url.find("://") + 3))
            title = "Imported from " + url[url_i[0] + 3:url_i[1]] if len(
                url[url_i[0] + 3:url_i[1]]) < 36 else url[url_i[0] + 3:36]
            return self.create(author_id, title, content)
        except ValueError:
            return None
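For comparison, the host part of the URL could also be obtained with the standard library instead of manual index arithmetic. A minimal sketch of that alternative (title_from_url is a hypothetical helper, not part of the original example):

from urllib.parse import urlparse

def title_from_url(url):
    # urlparse handles ports and missing path components more gracefully than find("://")
    host = urlparse(url).netloc
    title = "Imported from " + host
    # the 50-character cap below is an arbitrary assumption, not taken from the original
    return title if len(title) < 50 else title[:50]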
Example #2
def get_full_text_col(df: pd.DataFrame) -> None:
    """
    A method that will add a new column "full_text"
     in which the full text of the URL associated to the row will be there. It will also
     save the new DataFrame in a csv file.

    :param df: A DataFrame that contains a url column
    :return: None
    """
    full_text_list = []
    for url in tqdm(df['url']):
        try:
            content = trafilatura.fetch_url(url)
            full_text_list.append(
                trafilatura.extract(content,
                                    include_comments=False,
                                    include_tables=False,
                                    no_fallback=False))
        except Exception as e:
            print(e)
            full_text_list.append(np.nan)
        time.sleep(random.uniform(0.1, 1))
    print("Finished")
    df['full_text'] = full_text_list
    df.to_csv('../../data/raw/raw_data_facts_full_text.csv', index=False)
def multi_threading_scrapping(url_list):
    """
    The scrapping function that will iterate through all the list of given urls.

    Parameters:
    url_list (List): List of urls.

    Returns:
    Void function

    """
    for link in url_list:
        url = link.get('href')
        downloaded = trafilatura.fetch_url(url)

        # Check that the page has extractable content.
        if trafilatura.extract(downloaded) is not None:

            # Make a GET request to fetch the raw HTML content
            html_content = requests.get(url).text

            # Parse the html content
            soup = BeautifulSoup(html_content, "lxml")

            temp.append(url)

            # Complete the dictionary.
            my_dict['Url'].append(url)
            my_dict['Title'].append(soup.title.text)
            my_dict['Content'].append(trafilatura.extract(downloaded))
            my_dict['Score'].append(0)  # all scores are initialized at 0.
Example #4
def GetDocContent(topic_id, uuid, index='cw12'):
    url = baseUrl + '/cache?uuid={}&index={}&raw&plain'.format(uuid, index)
    # g = requests.get(url) 5e733d53-43e8-58f0-abfe-fa7fc2538733
    source_file = trafilatura.fetch_url(url) # g.text

    if not source_file:
        print('Cannot retrieve document {}'.format(uuid))
        time.sleep(0.5)
        return ' ', ' '
        # return GetDocContent(topic_id, uuid, index)

    print('Document has been retrieved successfully {}'.format(uuid))

    # Extract content using boilerpy3 and trafilatura, then combine results
    data_1 = trafilatura.extract(source_file)
    if data_1:
        data_1 = TAG_RE.sub('', data_1)
        doc_1 = nlp(data_1)
        sents_1 = [sent.text.strip().lower().replace('\n', ' ') for sent in doc_1.sents if len(sent.text) > 20]
    else:
        sents_1 = []

    data_2 = extractor.get_content(source_file)
    if data_2:
        data_2 = TAG_RE.sub('', data_2)
        doc_2 = nlp(data_2)
        sents_2 = [sent.text.strip().lower().replace('\n', ' ') for sent in doc_2.sents if len(sent.text) > 20]
    else:
        sents_2 = []

    final_data = list(set(sents_1) | set(sents_2))
    main_content = '\n'.join(final_data)

    return source_file, main_content
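This function relies on several module-level names that are not shown in the excerpt (baseUrl, TAG_RE, nlp, extractor). A minimal sketch of how they might be defined, with the base URL as a placeholder only:

import re
import spacy
from boilerpy3 import extractors

baseUrl = "https://search.example"         # placeholder; the real base URL is not shown in the excerpt
TAG_RE = re.compile(r"<[^>]+>")            # strips leftover HTML tags from the extracted text
nlp = spacy.load("en_core_web_sm")         # spaCy pipeline used only for sentence segmentation
extractor = extractors.ArticleExtractor()  # boilerpy3 extractor used alongside trafilatura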
Example #5
def scrape_page(url: str) -> None:
    """
    Scrapes the articles found for a given search URL and saves them as a CSV file.
    Works in particular with 'https://www.bbc.com/mundo/search?q=<TERM_TO_SEARCH>'
    :param url: The url of the search engine
    :return: None
    """
    term_to_search = url.split('=')[-1]
    s = HTMLSession()
    # # First search
    articles_pages, next_page = get_articles_pages(url, s)
    idx = 1

    with open(f'../../data/raw/bbc_articles_{term_to_search}.csv',
              'w',
              newline='') as csvfile:
        fieldnames = [
            'id', 'url', 'author', 'date', 'description', 'sitename', 'title',
            'text', 'categoria'
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        while next_page:
            for page in articles_pages:
                print('url:', page)
                time.sleep(random.uniform(1, 2))
                try:
                    content = trafilatura.fetch_url(page)
                    article = trafilatura.metadata.extract_metadata(content)
                    article['text'] = trafilatura.extract(
                        content,
                        include_comments=False,
                        include_tables=False,
                        no_fallback=False)

                    writer.writerow({
                        'id': idx,
                        'url': article['url'],
                        'author': article['author'],
                        'date': article['date'],
                        'description': article['description'],
                        'sitename': article['sitename'],
                        'title': article['title'],
                        'text': article['text'],
                        'categoria': 'confiable'
                    })

                except Exception as e:
                    print("Failed to get content", e)
                idx += 1
            print('=' * 50)
            print("NEXT:", next_page)
            try:
                articles_pages, next_page = get_articles_pages(next_page, s)
            except Exception as e:
                print("Failed to get new search page", e)
            time.sleep(random.uniform(10, 15))
    print("Finished")
Example #6
def url2sentences(url):  # crawls the page and splits the data into sentence-like units
    downloaded = trafilatura.fetch_url(url)
    result = trafilatura.extract(downloaded)
    web_doclist = result  # the crawled text (extraction based on text density)
    sentences = re.sub('[-=.#/?:$}]', '', web_doclist)  # remove unneeded characters with a regex
    sentences = sentences.split()  # split the string into a list of tokens
    for idx in range(0, len(sentences)):
        if len(sentences[idx]) <= 10:
            # merge short fragments into the previous entry
            sentences[idx - 1] += (' ' + sentences[idx])
            sentences[idx] = ''
    return sentences
def get_page_text(url):
    downloaded = trafilatura.fetch_url(url=url)
    h = html2text.HTML2Text()
    h.ignore_links = True
    extracted_data = trafilatura.extract(downloaded)
    if extracted_data is not None:
        page_text_output = h.handle(extracted_data).replace('\n', ' ').replace('  ', ' ').strip()
        print('page_text_output len:', len(page_text_output))
        return page_text_output
    else:
        return ''
Example #8
def readResults(urls, query):
    x = []  # list used to collect the extracted texts
    position = 0  # position on the SERP
    for page in urls:  # loop over the result items
        downloaded = trafilatura.fetch_url(page)
        if downloaded is not None:  # the download was successful
            result = trafilatura.extract(downloaded,
                                         include_tables=False,
                                         include_formatting=False,
                                         include_comments=False)
            x.append(result)
    return x
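Hypothetical usage turning the returned list into a data frame (pandas is not imported in the excerpt, so this is only an assumption about how the results might be consumed):

import pandas as pd

urls = ["https://example.com/a", "https://example.com/b"]
texts = readResults(urls, query="example")
# the SERP position here is simply the index in the returned list
serp_df = pd.DataFrame({"position": range(1, len(texts) + 1), "text": texts})
print(serp_df.head())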
Example #9
    def post(self):
        content = request.json
        logging.debug(content)

        language = content['language']
        show_explanations = 'explain' in content and content['explain']
        show_highlights = 'highlights' in content and content['highlights']
        disallowed_rels = content['disallowed_rels'] if 'disallowed_rels' in content else args.disallowed_rels.split(';')

        if 'uri' in content:
            downloaded = trafilatura.fetch_url(content['uri'])
            if downloaded is None:
                return jsonify({ "error": "Could not fetch URL" })
            text = trafilatura.extract(downloaded)
        elif 'text' in content:
            text = content['text']
        else:
            # neither 'uri' nor 'text' was supplied; bail out before `text` is used below
            return jsonify({ "error": "Request must contain either 'uri' or 'text'" })

        # Process text with labels...
        labels = content['labels']

        # Process text with labels
        response = predict(text, labels, language, disallowed_rels, show_explanations, show_highlights)

        ### response looks like this ###
        ### (both the labels and the paths are sorted by score) ###
        # [{'label': 'space',
        #   'score': 1.2366091944277287,
        #   'terms': [{'paths': [['space', 'label']], 'score': 1.0},
        #    {'paths': [['star', 'locatedat', 'space']], 'score': 0.18517242},
        #    {'paths': [['love', 'isa', 'television_show'],
        #      ['television_show', 'isa', 'space']],
        #     'score': 0.05143677}]},
        #  {'label': 'technology',
        #   'score': 0.1451974897645414,
        #   'terms': [{'paths': [['space', 'relatedto', 'science_fiction'],
        #      ['science_fiction', 'relatedto', 'technology']],
        #     'score': 0.14295651},
        #    {'paths': [['love', 'relatedto', 'technophilia'],
        #      ['technophilia', 'relatedto', 'technology']],
        #     'score': 0.0022409796}]},
        #  {'label': 'medicine',
        #   'score': 0.05455923452973366,
        #   'terms': [{'paths': [['space', 'relatedto', 'science'],
        #      ['science', 'relatedto', 'medicine']],
        #     'score': 0.054559235}]}]

        # Return the output as a JSON string
        return jsonify({
            "text": text,
            "labels": labels,
            "results": response
        })
Example #10
def content_extract_with_t(url_list):
    # Loop over the list, fetch each URL, and print its extracted main text.

    for i in range(len(url_list)):
        print('URL :', url_list[i])
        downloaded = trafilatura.fetch_url(url_list[i])
        content = trafilatura.extract(downloaded)

        print(
            "*************************************************************************************************"
        )
        print(content)
        print(
            "*************************************************************************************************"
        )
Example #11
def Sentiment(request):
    ajx = False
    if request.method == 'POST':
        if request.POST.get('text_up'):
            text = request.POST.get('text')
        elif request.POST.get('url_up'):
            url_up = request.POST.get('text')
            url = re.search(r"(?P<url>https?://[^\s]+)", url_up).group("url")
            downloaded = trafilatura.fetch_url(url)
            text = trafilatura.extract(downloaded)

        elif request.POST.get('upld'):
            text = upload(request.FILES['file'])

        if request.is_ajax():
            text = request.POST.get('text')
            ajx = True
        # text = request.POST.get('text')
        print(text)
        sid = SentimentIntensityAnalyzer()

        #message_text = '''It seems to me we are in the middle of no man's land with respect to the  following:  Opec production speculation, Mid east crisis and renewed  tensions, US elections and what looks like a slowing economy (?), and no real weather anywhere in the world. I think it would be most prudent to play  the markets from a very flat price position and try to day trade more aggressively. I have no intentions of outguessing Mr. Greenspan, the US. electorate, the Opec ministers and their new important roles, The Israeli and Palestinian leaders, and somewhat importantly, Mother Nature.  Given that, and that we cannot afford to lose any more money, and that Var seems to be a problem, let's be as flat as possible. I'm ok with spread risk  (not front to backs, but commodity spreads). The morning meetings are not inspiring, and I don't have a real feel for  everyone's passion with respect to the markets.  As such, I'd like to ask  John N. to run the morning meetings on Mon. and Wed.  Thanks. Jeff'''

        message = text

        # Calling the polarity_scores method on sid and passing in the message_text outputs a dictionary with negative, neutral, positive, and compound scores for the input text
        scores = sid.polarity_scores(message)

        # Here we loop through the keys contained in scores (pos, neu, neg, and compound scores) and print the key-value pairs on the screen
        d = {}
        for key in sorted(scores):
            print('{0}: {1}, '.format(key, scores[key]), end='')
            val = round(scores[key] * 100, 2)
            d.update({key: val})
        print(d)

        d.update({"flag": 1, "text": text})
        print(d)

        if ajx:
            return JsonResponse(d, status=200)
        else:
            return render(request, 'Sentiment.html', context=d)
    else:
        if ajx:
            return JsonResponse(None, status=200)
        else:
            return render(request, 'Sentiment.html', {"message": None})
Example #12
def LangTranslate(request):
    language = list(LANGUAGES.values())
    ajx = False
    if request.method == 'POST':
        if request.POST.get('text_up'):
            text = request.POST.get('text')
        elif request.POST.get('url_up'):
            url_up = request.POST.get('text')
            url = re.search(r"(?P<url>https?://[^\s]+)", url_up).group("url")
            downloaded = trafilatura.fetch_url(url)
            text = trafilatura.extract(downloaded)

        elif request.POST.get('upld'):
            text = upload(request.FILES['file'])
        if request.is_ajax():
            text = request.POST.get('text')
            ajx = True

        inputLanguage = str(request.POST.get('in_lang')).lower()
        outputLanguage = str(request.POST.get('out_lang')).lower()
        dataToTranslate = text
        print(inputLanguage, outputLanguage)
        translator = Translator(from_lang=inputLanguage,
                                to_lang=outputLanguage)
        translation = translator.translate(dataToTranslate)
        if ajx:
            return JsonResponse(
                {
                    'translation': translation,
                    'language': language,
                    'text': text
                },
                status=200)
        else:
            return render(
                request, 'LangTranslate.html', {
                    'translation': translation,
                    'language': language,
                    'text': text,
                    'in_lang': inputLanguage,
                    'out_lang': outputLanguage
                })
    else:
        if ajx:
            return JsonResponse(None, status=200)
        else:
            return render(request, 'LangTranslate.html',
                          {'language': language})
Example #13
def get_title_text_web(url):
    downloaded = trafilatura.fetch_url(url)
    if downloaded is None:
        title = 'Not working title'
        text = 'Not working text'
        check = 'fake'
        dictio = {'title': [title], 'text': [text], 'check': check}
        df = pd.DataFrame(dictio, columns=['title', 'text', 'check'])
        return df
    text = trafilatura.extract(downloaded)
    html = request.urlopen(url).read().decode('utf8')
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.find('title').string
    dictio = {'title': [title], 'text': [text], 'check': True}
    df = pd.DataFrame(dictio, columns=['title', 'text', 'check'])
    return df
def Extract_Contents(clean_links):
    list2 = []
    for url in clean_links:
        downloaded = trafilatura.fetch_url(url)
        # extract the main content as plain text, without comments
        list1 = trafilatura.extract(downloaded, include_comments=False)
        list2.append("\n")
        list2.append(
            "---------------------------------------------------------------------------------------------------------------------"
        )
        list2.append("\n")
        list2.append("Below contents are extracted from this url:")
        list2.append("\n")
        list2.append(url)
        list2.append("\n")
        list2.append(list1)
        list3 = ''.join(filter(None, list2))
    return list3
Example #15
    def extract_url(self, url: str, url_id: str = None):
        try:
            from trafilatura import extract, fetch_url
        except ImportError:
            logger.error(
                "Trafilatura is not installed, install as follows: pip install trafilatura"
            )
            return []

        url_id = url_id or "{:02x}".format(mmh3.hash(url, signed=False))
        url_content = fetch_url(
            url=url,
            no_ssl=self.no_ssl,
        )
        extracted_dict = None
        if url_content is not None:
            extracted_data = extract(
                filecontent=url_content,
                record_id=url_id,
                no_fallback=self.no_fallback,
                output_format=self._output_format,
                include_comments=self.include_comments,
                include_tables=self.include_tables,
                include_images=self.include_images,
                include_formatting=self.include_formatting,
                include_links=self.include_links,
                deduplicate=self.deduplicate,
                url_blacklist=self.url_blacklist,
                target_language=self.target_language,
            )

            if extracted_data:
                extracted_dict = json.loads(extracted_data)
                if "raw-text" in extracted_dict:
                    del extracted_dict["raw-text"]

        return extracted_dict
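The method assumes an importing class with a number of configuration attributes and the json, mmh3 and logging modules in scope. A minimal sketch of that surrounding setup (the class name UrlTextExtractor and all default values are assumptions):

import json
import logging

import mmh3

logger = logging.getLogger(__name__)


class UrlTextExtractor:
    """Hypothetical container for the attributes the method reads; defaults are assumptions."""

    def __init__(self):
        self.no_ssl = False
        self.no_fallback = False
        self._output_format = "json"    # json.loads() on the result implies JSON output
        self.include_comments = False
        self.include_tables = True
        self.include_images = False
        self.include_formatting = False
        self.include_links = False
        self.deduplicate = True
        self.url_blacklist = None
        self.target_language = None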
Example #16
File: app.py Project: deivy311/HRI2
def get_more_info():
    post_request = request.get_json(force=True) # Get data posted as a json
    id = int(post_request['patient_id'])
    if not id:
        resp = flask.jsonify("No ID provided")
        resp.headers.add('Access-Control-Allow-Origin', '*')
        return resp
    
    patients_info = pd.read_csv("patient_info.csv")
    patient_info = patients_info.loc[patients_info['id'] == id]
    out = patient_info.to_dict(orient = "records")[0]

    #tosearch="treatment "+out['diagnosis']+" medicinenet.com"
    query_D=out['diagnosis']+" medicinenet.com"
    res=search(query_D, tld="co.in", num=15, stop=15, pause=2)
    buttoms={}
    for j in res: 
        downloaded = trafilatura.fetch_url(j)
        res=trafilatura.extract(downloaded,include_comments=False,include_tables=False, target_language='en')
        temp_buttoms=res[1]
        temp_buttoms.update(res[2])
        if (bool(temp_buttoms)):
            buttoms.update(temp_buttoms)
            #print(j)
        if len(buttoms)>=4:
            print("got it! ")
            break
    
    if not bool(buttoms):
        resp = flask.jsonify("No results!")
        resp.headers.add('Access-Control-Allow-Origin', '*')
        return resp
    # buttoms=out
    resp = flask.jsonify(buttoms)
    resp.headers.add('Access-Control-Allow-Origin', '*')
    return resp
Example #17
def extractcontents(urls):
    raw_contents = []
    for i in range(len(urls)):
        d = trafilatura.fetch_url(urls[i])
        raw_contents.append(trafilatura.extract(d, include_comments=False))
    return raw_contents
Example #18
def web_to_text(url):
    downloaded = trafilatura.fetch_url(url)
    return trafilatura.extract(downloaded)
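Hypothetical usage of web_to_text; trafilatura returns None when the download or the extraction fails, so a caller would typically check for that:

text = web_to_text("https://example.com/")
if text is None:
    print("No main content could be extracted.")
else:
    print(text[:200])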
Example #19
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import trafilatura
array_links = ["http://www.commonlii.org/lk/cases/LKCA/1872/1.html"]
array_text = []
for l in array_links:
    html = trafilatura.fetch_url(l)
    text = trafilatura.extract(html)
    text_clean = text.replace("\n", " ").replace("\'", "")
    array_text.append(text_clean[0:5000])
from pytopicrank import TopicRank
for j in range(len(array_text)):
    tr = TopicRank(array_text[j])
    print("Keywords of article", str(j + 1), "\n",
          tr.get_top_n(n=5, extract_strategy='first'))
Example #20
def Summary(request):
    ajx = False
    if request.method == 'POST':
        if request.POST.get('text_up'):
            text = request.POST.get('text')
        elif request.POST.get('url_up'):
            url_up = request.POST.get('text')
            url = re.search(r"(?P<url>https?://[^\s]+)", url_up).group("url")
            downloaded = trafilatura.fetch_url(url)
            text = trafilatura.extract(downloaded)

        elif request.POST.get('upld'):
            text = upload(request.FILES['file'])

        top_n = 0

        if request.is_ajax():
            text = request.POST.get('text')
            count = len(nltk.tokenize.sent_tokenize(text))
            if count == 1:
                return JsonResponse({'output': 'Text is already summarized.'},
                                    status=200)
            top_n = int(count * 0.6)
            ajx = True
        else:
            count = len(nltk.tokenize.sent_tokenize(text))
            top_n = int(request.POST.get('n'))
            if top_n == count:
                return render(request, 'Summary.html', {
                    'output': text,
                    'text': text
                })
            elif top_n > count:
                return render(
                    request, 'Summary.html', {
                        'output':
                        'Please enter appropriate value for frequency count!!',
                        'text': text
                    })
            elif count == 1:
                return render(request, 'Summary.html', {
                    'output': 'Text is already summarized.',
                    'text': text
                })
        nltk.download("stopwords")
        stop_words = stopwords.words('english')
        summarize_text = []

        # Step 1 - Read text and split it
        data = text
        sentences = split_sentence(data)
        # Step 2 - Generate Similarity Matrix across sentences
        sentence_similarity_martix = build_similarity_matrix(
            sentences, stop_words)

        # Step 3 - Rank sentences in similarity matrix
        sentence_similarity_graph = nx.from_numpy_array(
            sentence_similarity_martix)
        scores = nx.pagerank(sentence_similarity_graph)

        # Step 4 - Sort the rank and pick top sentences
        ranked_sentence = sorted(
            ((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
        #print("Indexes of top ranked_sentence order are ", ranked_sentence)

        for i in range(top_n):
            summarize_text.append(" ".join(ranked_sentence[i][1]))

        # Step 5 - Finally, output the summarized text
        print("Summarized Text: \n", ". ".join(summarize_text))
        summary = ". ".join(summarize_text)
        if ajx:
            return JsonResponse({'output': summary}, status=200)
        else:
            return render(request, 'Summary.html', {
                'output': summary,
                'text': text,
                'n': top_n
            })
    else:
        if ajx:
            return JsonResponse({'output': None}, status=200)
        else:
            return render(request, 'Summary.html', context=None)
Example #21
def Optimize(request):
    if request.method == 'POST':
        text = ''
        if request.POST.get('text_up'):
            text = request.POST.get('text')
        elif request.POST.get('url_up'):
            url_up = request.POST.get('text')
            url = re.search(r"(?P<url>https?://[^\s]+)", url_up).group("url")
            downloaded = trafilatura.fetch_url(url)
            text = trafilatura.extract(downloaded)
        elif request.POST.get('upld'):
            text = upload(request.FILES['file'])

        #GRAMMAR CORRECTION
        # text = "Hello my name is Apoorva. My name is also popo. My home is in Indore/Sagar."
        tool = language_tool_python.LanguageTool('en-US')
        # text=request.POST.get('text')
        t1 = text
        matches = tool.check(t1)
        #print(matches)

        my_mistakes = []
        my_corrections = []
        start_positions = []
        end_positions = []

        for rules in matches:
            if len(rules.replacements) > 0:
                start_positions.append(rules.offset)
                end_positions.append(rules.errorLength + rules.offset)
                my_mistakes.append(text[rules.offset:rules.errorLength +
                                        rules.offset])
                my_corrections.append(rules.replacements[0])

        my_new_text = list(t1)

        for m in range(len(start_positions)):
            for i in range(len(text)):
                my_new_text[start_positions[m]] = my_corrections[m]
                if (i > start_positions[m] and i < end_positions[m]):
                    my_new_text[i] = ""

        my_new_text = "".join(my_new_text)

        print(my_new_text)

        #FREQUENT WORDS
        stop_words = set(
            stopwords.words('english'))  # for removing words such as "is", "an", "in"
        tokenizer = nltk.RegexpTokenizer(r"\w+")  # removes punctuation
        words = tokenizer.tokenize(t1)

        st = WordNetLemmatizer()
        filtered_sentence = [
            w for w in words if not w in stop_words and len(w) != 1
        ]
        lemmas = set(st.lemmatize(fs) for fs in filtered_sentence)
        n = int(len(lemmas) * 0.3)
        frequency = Counter(lemmas)
        print(frequency)

        n_items = dict(
            sorted(frequency.items(), key=operator.itemgetter(1),
                   reverse=True)[:n])
        #SYNONYMS
        synonyms = {}

        for f in n_items:
            for syn in wordnet.synsets(f):
                s = set()
                if len(syn.lemmas()) == 1:
                    continue
                for l in syn.lemmas():
                    s.add(l.name())
                synonyms[f] = ','.join(list(s))

        print(synonyms)
        return render(request, 'Optimize.html', {
            'optimized': my_new_text,
            "fl_1": 1,
            "text": text,
            'words': synonyms
        })
    else:
        return render(request, 'Optimize.html', context=None)
import trafilatura
import time
from tabulate import tabulate
from lxml import html
from lxml import etree
from trafilatura.xml import validate_tei

downloaded = trafilatura.fetch_url('https://github.blog/2019-03-29-leader-spotlight-erin-spiceland/')
if downloaded is not None:
    print("Download successful.")

# Run the extraction with default values for the keyword arguments and measure the time:
start = time.time()
result = trafilatura.extract(downloaded)
end = time.time()
duration = end - start
#print(result)

# Run the extraction with the keyword-argument values that presumably give the
# fastest execution time, and measure the time again:
start2 = time.time()
result2 = trafilatura.extract(downloaded, include_comments=False, include_tables=False, no_fallback=True)
end2 = time.time()
duration2 = end2 - start2

# Try a different combination of settings:
start3 = time.time()
result3 = trafilatura.extract(downloaded, include_tables=False, no_fallback=True)
end3 = time.time()
duration3 = end3 - start3
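The excerpt imports tabulate but stops before the timings are reported. A minimal sketch of how the three runs might be compared, purely as an assumption:

rows = [
    ["default settings", duration, len(result or "")],
    ["no comments/tables, no fallback", duration2, len(result2 or "")],
    ["no tables, no fallback", duration3, len(result3 or "")],
]
print(tabulate(rows, headers=["configuration", "seconds", "extracted chars"]))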
Example #23
def get_url_components(url: str) -> dict:
    content = trafilatura.fetch_url(url)
    return trafilatura.bare_extraction(content)
def main():
    Topic = st.text_input('Input the topic here and press ENTER:')

    #if len(Topic)>0:
    if st.sidebar.button("Extract URLs for the given topic"):
        with st.spinner("Extracting..."):
            links = scrape_google_all(Topic)
            clean_links = Extract_Ranked_urls(links)
            st.write("Below are the top URLs to extract content:")
            for x in clean_links:
                st.write(x)
    st.sidebar.markdown("*******************************")
    if st.sidebar.button("Download Contents from URLs"):
        #    if text is not None:
        with st.spinner("Downloading..."):
            links = scrape_google_all(Topic)
            clean_links = Extract_Ranked_urls(links)
            list3 = Extract_Contents(clean_links)
            #            data1 = para_correct(list3)
            data = [
                content.strip() for content in list3.splitlines() if content
            ]
            data1 = '\\n\n'.join(f"{row}\n" for row in data)
            doc = Document()
            doc.add_paragraph(data1)
            #            docx = Document(io.BytesIO(requests.get(doc).content))
            #            b64 = base64.b64encode(docx)  # some strings <-> bytes conversions necessary here
            #            href = f'<a href="data:file/docx;base64,{b64}">Download docx file</a>'
            #            st.markdown(href, unsafe_allow_html=True)
            #            doc.paragraph_format.space_after = Inches(1.0)
            try:
                doc.save(str(Topic) + ".docx")
            except:
                st.write("Oops!", sys.exc_info()[0], "occurred.")
        st.markdown("Download Complete")
    st.sidebar.markdown("*******************************")
    if st.sidebar.checkbox("View the Extracted Contents"):
        with st.spinner("Downloading the Contents..."):
            links = scrape_google_all(Topic)
            clean_links = Extract_Ranked_urls(links)
            list3 = Extract_Contents(clean_links)
            Extracted_Contents = View_Extracted_Contents(list3)
            data = [
                content.strip() for content in Extracted_Contents.splitlines()
                if content
            ]
            for x in data:
                st.write(x)
            list2 = []
            for url in clean_links:
                downloaded = trafilatura.fetch_url(url)
                # extract the main content as plain text, without comments
                list1 = trafilatura.extract(downloaded, include_comments=False)
                st.write("***************************************")
                st.write(url)
                if list1 is None:
                    st.write("Contents not available")
                else:
                    st.write("Contents available")
                ua = UserAgent()
                response = requests.get(url, headers={"User-Agent": ua.random})

                st.write("Response Code: ", response.status_code)
Example #25
    # if validators.domain("http://" + val):
    #     url = "http://" + val
    # elif validators.domain("http://www." + val):
    #    url = "http://www." + val

    # print(validators.domain("https://" + val))

    # Test
    print(url)

    if url != 0:
        worksheet.update('A' + row, url)

        # Size of page in characters
        worksheet = sh.worksheet("shell")  # By title
        downloaded = trafilatura.fetch_url(url)
        worksheet.update("B" + row, len(str(downloaded)))
#        print(downloaded)

#    print(downloaded)
    if url != 0 and downloaded:
        from readability.readability import Document
        from html2text import html2text

        readable_article = Document(downloaded).summary()

        raw = html2text(readable_article)
        print(raw)

        # Lexicon Count - number of words present in the text
        lexicon_count = textstat.lexicon_count(raw, removepunct=True)
Example #26
        if x > i:
            i = x
    return i


if add_selectbox == "Text":
    body = st.text_area('Insert your text here, as clean as possible.')
    if st.button("Predict"):
        st.success(":crystal_ball: " + predictor.predict(body) +
                   " :crystal_ball:")
        st.success("With a probability of " +
                   "{:.1%}".format(get_prob(predictor.predict_proba(body))))
        #st.write("How did we get that result:")
        #st.success(eli5.formatters.text.format_as_text(eli5.explain_prediction(predictor, body)))

elif add_selectbox == "Url":
    body = st.text_input('Insert your url here')
    if st.button("Predict"):
        page = body
        downloaded = trafilatura.fetch_url(page)
        result = trafilatura.extract(downloaded,
                                     include_tables=False,
                                     include_formatting=False,
                                     include_comments=False)
        st.success(":crystal_ball: " + predictor.predict(result) +
                   " :crystal_ball:")
        st.success("With a probability of " +
                   "{:.1%}".format(get_prob(predictor.predict_proba(result))))
        #st.write("How did we get that result:")
        #st.success(eli5.formatters.text.format_as_text(eli5.explain_prediction(predictor, result)))
Example #27
import sys
import pyttsx3
import trafilatura
import os




app = pyttsx3.init()
rate = app.getProperty("rate")
app.setProperty("rate", rate-70)
voices = app.getProperty("voices")
app.setProperty("voice", voices[0].id)


try:
    if (str(sys.argv[1]) == "fetch" and str(sys.argv[2]) is not None):
        category_fetched = trafilatura.fetch_url("https://de.wikibooks.org/wiki/Regal:{}".format(sys.argv[2]))
        text = trafilatura.extract(category_fetched)
        print(text)
        user_input = input("Choose one field. Note! Words that contain whitespaces need to be written with an underscore('_'): ")
        url_fetched = trafilatura.fetch_url("https://de.wikibooks.org/wiki/{}".format(user_input))
        url_text = trafilatura.extract(url_fetched, with_metadata=False)
        print(url_text)
        subcategory_input = input("Choose one subcategory. Note! Words that contain whitespaces need to be written with an underscore('_'): ")
        subcategory_fetched = trafilatura.fetch_url("https://de.wikibooks.org/wiki/{}/_{}".format(user_input, subcategory_input))
        sub_fetched_alternative = trafilatura.fetch_url("https://de.wikibooks.org/wiki/{}:_{}".format(user_input, subcategory_input))
        subcategory_text = trafilatura.extract(subcategory_fetched)

        if (subcategory_text is None):
            subcategory_text = trafilatura.extract(sub_fetched_alternative)
            print(subcategory_text)
         
Example #28
def main():
    """A Simple NLP app with Spacy-Streamlit"""
    st.title("Text processing app for biological scientific papers")
    menu = ["Home", "NER", "Summarization", "Zero shot learning"]
    choice = st.sidebar.selectbox("Menu", menu)
    if choice == "Home":

        link = '[GitHub page](https://github.com/fm1320/IC_NLP)'
        st.write(
            """This application was made as part of a postgradute program at Imeprial College London. The details about the traning of the models, data and the techniques can be found at my personal github page provided below."""
        )
        st.markdown(link, unsafe_allow_html=True)
        st.write(
            """<---- Choose and try out one of the NLP tasks available from the drop down menu on the left"""
        )
        st.markdown(
            "![Alt Text](https://upload.wikimedia.org/wikipedia/en/thumb/5/5f/Imperial_College_London_monotone_logo.jpg/320px-Imperial_College_London_monotone_logo.jpg)"
        )
        st.write(
            "*Text examples source: Garcia-Perez, I., Posma, J.M., Serrano-Contreras, J.I. et al. Identifying unknown metabolites using NMR-based metabolic profiling techniques. Nat Protoc 15, 2538–2567 (2020). https://doi.org/10.1038/s41596-020-0343-3"
        )

        #st.subheader("Tokenization")
        #raw_text = st.text_area("Your Text","Enter Text Here")
        #docx = nlp(raw_text)
        #if st.button("Tokenize"):
        #   spacy_streamlit.visualize_tokens(docx,attrs=['text','pos_','dep_','ent_type_'])

    elif choice == "NER":
        st.subheader("Named Entity Recognition")
        # Add a selectbox to the sidebar:
        sel = st.sidebar.selectbox("Which NER model would you like to use ?", [
            "SpaCy Bloom embedding DL", "Spacy core en default",
            "String/Regex matching"
        ])

        # if sel== "SciSpacy":
        #import scispacy
        # nlp = spacy.load("en_core_sci_sm")
        # elif sel=="DL small":
        # nlp = spacy.load('./BiA') #Location of directory of spacy model
        if sel == "SpaCy Bloom embedding DL":
            path = model_loader(
                "https://github.com/fm1320/IC_NLP/releases/download/V3/V3-20210203T001829Z-001.zip",
                "V3")
            nlp = spacy.load(path)
        elif sel == "Spacy core en default":
            import en_core_web_sm
            nlp = en_core_web_sm.load()
            st.write(
                "*This is an example of a default model with general entities. Choose one of the other two to see assay recognition."
            )
        elif sel == "String/Regex matching":
            #r_text = st.text_area("Enter text for entity recognition with Regex","Text here")
            r_text = st.text_area(
                "Enter text for entity recognition with Regex",
                "However, it is very challenging to elucidate the structure of all metabolites present in biofluid samples. The large number of unknown or unidentified metabolites with high dynamic concentration range, extensive chemical diversity and different physical properties poses a substantial analytical challenge. Metabolic profiling studies are often geared toward finding differences in the levels of metabolites that are statistically correlated with a clinical outcome, dietary intervention or toxic exposure when compared to a control group. The chemical assignment of this reduced panel of biologically relevant metabolites is possible using statistical spectroscopic tools9–11, two-dimensional (2D) NMR spectroscopic analysis12–14, separation and pre-concentration techniques11, various chromatographic and mass spectroscopy (MS)-based analytical platforms."
            )
            iz = finder(r_text, "")
            ######################################
            # '''
            # model_id = model_ids[model_desc]
            # ex_names, ex_map = load_examples(model_id)

            # st.title('Zero Shot Topic Classification')
            # sequence = st.text_area('Text', ex_map[example][0], key='sequence', height=height)
            # labels = st.text_input('Possible topics (separated by `,`)', ex_map[example][1], max_chars=1000)
            # multi_class = st.checkbox('Allow multiple correct topics', value=True)
            # hypothesis_template = "This text is about {}."
            # labels = list(set([x.strip() for x in labels.strip().split(',') if len(x.strip()) > 0]))
            # if len(labels) == 0 or len(sequence) == 0:
            # st.write('Enter some text and at least one possible topic to see predictions.')
            # return
            # if do_print_code:
            # st.markdown(CODE_DESC.format(model_id))

            # with st.spinner('Classifying...'):
            # top_topics, scores = get_most_likely(model_id, sequence, labels, hypothesis_template, multi_class, do_print_code)

            # plot_result(top_topics[::-1][-10:], scores[::-1][-10:])

            # if "socat" not in [p.name() for p in psutil.process_iter()]:
            # os.system('socat tcp-listen:8000,reuseaddr,fork tcp:localhost:8001 &')
            # '''
            ##########################################
        method = st.sidebar.selectbox(
            "Choose input method (recommended:text box)", ["Text box", "URL"])

        if method == "Text box" and sel != "String/Regex matching":
            raw_text = st.text_area(
                "Enter text for entity recognition",
                "However, it is very challenging to elucidate the structure of all metabolites present in biofluid samples. The large number of unknown or unidentified metabolites with high dynamic concentration range, extensive chemical diversity and different physical properties poses a substantial analytical challenge. Metabolic profiling studies are often geared toward finding differences in the levels of metabolites that are statistically correlated with a clinical outcome, dietary intervention or toxic exposure when compared to a control group. The chemical assignment of this reduced panel of biologically relevant metabolites is possible using statistical spectroscopic tools9–11, two-dimensional (2D) NMR spectroscopic analysis12–14, separation and pre-concentration techniques11, various chromatographic and mass spectroscopy (MS)-based analytical platforms."
            )
            docx = nlp(raw_text)
            spacy_streamlit.visualize_ner(docx,
                                          labels=nlp.get_pipe('ner').labels)

        if method == "URL" and sel != "String/Regex matching":
            user_input = st.text_input("Enter page URL of an HTML file")
            if user_input is not None:
                downloaded = trafilatura.fetch_url(user_input)
                raw_text = trafilatura.extract(downloaded)
                raw_text = str(raw_text)
                docx = nlp(raw_text)
                spacy_streamlit.visualize_ner(
                    docx, labels=nlp.get_pipe('ner').labels)

    elif choice == "Summarization":
        #Textbox for text user is entering
        st.subheader(
            "Enter the text you'd like to summarize (Here is an example that can be pasted in the text box!)"
        )
        raw_text = st.text_area('''
    For over three decades, NMR spectroscopy has been widely applied in metabolic profiling and phenotyping1,2,3. The technology allows for accurate high-throughput screening of thousands of metabolites (small molecular species <1 kDa) present in a biological sample4,5,6,7, such as urine, plasma, feces, saliva and multiple types of tissues, as well as food8 and plant extracts. NMR spectroscopy provides robust multi-metabolite fingerprints of hundreds of metabolites in many biofluids, many of which are listed in spectral databases, particularly for common biofluids in urine and blood.
    However, it is very challenging to elucidate the structure of all metabolites present in biofluid samples. The large number of unknown or unidentified metabolites with high dynamic concentration range, extensive chemical diversity and different physical properties poses a substantial analytical challenge. Metabolic profiling studies are often geared toward finding differences in the levels of metabolites that are statistically correlated with a clinical outcome, dietary intervention or toxic exposure when compared to a control group. The chemical assignment of this reduced panel of biologically relevant metabolites is possible using statistical spectroscopic tools9,10,11, two-dimensional (2D) NMR spectroscopic analysis12,13,14, separation and pre-concentration techniques11, various chromatographic and mass spectroscopy 
    (MS)-based analytical platforms15,16 and existing spectral databases. However, the structural elucidation of NMR resonances relating to unknown molecules remains a major bottleneck in metabolic profiling studies. As a result, many published NMR-based metabolic profiling studies still continue to include putatively identified metabolites and unknown features without providing unequivocal proof of assignment, or they simply label peaks as ‘unknown’, thereby potentially missing key mechanistic information.
    To avoid the problem of multiple entries for the same compound in databases under different names, a community-wide effort is underway to develop better, faster and more standardized metabolite identification strategies, such as implementing standard nomenclature for newly identified metabolites using the International Chemical Identifier (InChI)17. Sumner et al. proposed a four-level system18 for assigning a confidence level to newly identified metabolites in metabolic profiling studies: 1) positively identified compounds (with a name, a known structure, a CAS number or an InChI); 2) putatively annotated compounds using spectral similarity with databases but without chemical reference standard; 3) putatively identified chemicals within a compound class; and 4) unknown compounds. Wishart et al. proposed a further distinction for those metabolites: the ‘known unknowns’ and the ‘unknown unknowns’19.
    A ‘known unknown’ corresponds to a metabolite that has not yet been identified in the sample of interest but that has been previously described in a database or in the literature, whereas a truly new compound, an ‘unknown unknown’, has never been described or formally identified.
    Commercial packages, such as Bruker’s AMIX TM software, and open-source software20, such as COLMAR (http://spinportal.magnet.fsu.edu/), can help with identifying these ‘known unknowns’, and some of these software applications are capable of automatically or semi-automatically annotating a limited number of compounds in a biological sample. However, even with automated annotation, the software still requires manual revision and can be prone to inconsistent interpretation and assignment by different individuals19. Most software packages and databases do not support identification of ‘unknown unknowns’, although a few platforms, such as AMIX, include prediction software to aid the identification of new compounds.
    Open-access databases have been created for researchers to deposit information relating to newly identified compounds. Most of the available databases, such as the Human Metabolome Database (HMDB)21, the BioMagResBank (BMRB)22, PRIMe server23, COLMAR 1H(13C)-TOCCATA and Bruker-AMIX (http://www.bruker-biospin.com/amix.html), contain chemical shift values, relative intensity and peak shape information for 1H-NMR and often 13C-NMR data to support metabolite identification. However, all databases contain inherent errors, such as incorrect structures for the metabolites, incorrect names and incorrect assigments. This problem is compounded further by the effect that experimental conditions, such as the pH or ionic content of the sample, can have on the chemical shift of a metabolite.
    Some of these databases, such as HMDB, provide complementary information, including MS assignments, which can be useful for checking potential errors in assignments of NMR peaks. However, although there are resources available to aid assignment of candidate biomarkers, there is no panacea for accurate metabolite identification, and there remains a clear unmet need for improved strategies for metabolite identification and curation for NMR spectral profiling.  
      ''')  #text is stored in this variable
        summWords = summarize(raw_text)
        st.subheader("Summary")
        st.write(summWords)

    elif choice == "Zero shot learning":
        st.write(
            """Due to resource constraints, this demo is moved to the link below:"""
        )
        link = '[Zero shot learning for NER demo](https://colab.research.google.com/drive/1zKDbjLo9vyEuSRotSSVwFLyaA61o1ceG#scrollTo=hkfE6NRA0Dzy)'
        st.markdown(link, unsafe_allow_html=True)
        st.write(
            "*Thanks to Hugging face's wonderful model repository and inspired by Joe Davison (researcher at hugging face)"
        )
        hug = '[Hugging face](https://huggingface.co/)'
        st.markdown(hug, unsafe_allow_html=True)