Example #1
from entity_api import entity_extract
from untitled1 import Article, db, Keyword

# article = Article.query.filter_by(id=92).first()
# print article.category
# article.category = 'Musics'
# db.session.commit()

#data = Article.query.all()
#for article in data:
#    print article.category

#entity_extract(1, "The East India Company in the 1700s conjures pictures of British colonisation. What originally started as trade and business eventually led to 200 years of British Raj for our country.\nAt its peak, the company accounted for about half the global trade specialising in commodities like cotton, indigo, tea and opium, and offering employment to a third of the British workforce.\nAfter the 1857 mutiny in India, all its powers were transferred to the British Crown, and eventually, by 1874, the company was dissolved.\nForwarding into over a century later, in 2005, Mumbai-born entrepreneur, Sanjiv Mehta, with a \u201csense of redemption\u201d, bought the company from its 30 odd owners, and turned it into a luxury food brand.\nThe company now specialises in selling gourmet coffees, chocolates, rare teas, and other luxury-food items through its e-commerce website.\nThe first store was launched in the Mayfair neighborhood in London. Today, the company has escalated commercially and now runs stores across the UK, the Middle East, Europe and Asia, in addition to a successful e-commerce website.\nOn inaugurating the company, Mehta received congratulatory e-mails from thousands of Indians.\n\u201cIt is a dream come true to build a business like this and to acquire a brand like this to own the company,\u201d he said.\u201d\n\nParticipate in this discussion")
# key = Keyword.query.filter_by(id=2).first()
# print key.key_name
# Re-run entity extraction for articles already stored with ids 560-1289.
for i in range(560, 1290):
    article = Article.query.filter_by(id=i).first()
    if article is None:
        continue
    entity_extract(article.id, article.full_story, 1)
    print "done"
print "all done"
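A hedged variant of the same backfill loop (not part of the original example): it assumes the entity_extract(article_id, text, news_flag) call pattern used above and simply keeps going when one article fails, rolling back any half-finished database work.

from entity_api import entity_extract
from untitled1 import Article, db

for i in range(560, 1290):
    article = Article.query.filter_by(id=i).first()
    if article is None:
        continue
    try:
        entity_extract(article.id, article.full_story, 1)
        print "done", article.id
    except Exception as e:
        # Report the failure and continue with the next article.
        print e
        db.session.rollback()
print "all done"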
Example #2
        simple_text = bsObj.find("div", attrs={"class": "ys_post_content text"}).get_text()

        # category
        category = "YourStory"

        # print title
        # print image
        # print description
        # print full_story
        # print simple_text
        # print category
        # print date

        if not db.session.query(Article).filter(Article.title == title).count():
            article_a = Article(title=title, full_story=simple_text, image=image, category=category,
                                description=description, pubdate=date, html=full_story)
            db.session.add(article_a)
            db.session.commit()
            print article_a.id
            entity_extract(article_a.id, simple_text, 1)

    except psycopg2.ProgrammingError:  # as ie:
        # print ie
        # print "Caught"
        pass
        # db.session.rollback()
        # break
        # continue

    except Exception as e:
        print(e)
Example #3
def entity(Id, data, news):
    # Thin wrapper that forwards its arguments straight to entity_extract.
    entity_extract(Id, data, news)
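A minimal usage sketch for this wrapper (not from the original code), assuming the Article model and the (article_id, text, news_flag) call pattern seen in the other examples:

from untitled1 import Article

article = Article.query.filter_by(id=1).first()
if article is not None:
    # Forward the stored article through the wrapper above.
    entity(article.id, article.full_story, 1)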
Example #4
        # print image
        # print description
        # print full_story
        # print simple_text
        # print category
        # print date

        if not db.session.query(Article).filter(
                Article.title == title).count():
            article_a = Article(title=title,
                                full_story=simple_text,
                                image=image,
                                category=category,
                                description=description,
                                pubdate=date,
                                html=full_story)
            db.session.add(article_a)
            db.session.commit()
            print article_a.id
            entity_extract(article_a.id, simple_text, 1)

    except psycopg2.ProgrammingError:  # as ie:
        # print ie
        #print"Caught"
        pass
        #db.session.rollback()
        # break
        # continue

    except Exception as e:
        print(e)
Example #5
                # print title
                # print image
                # print description
                # print link
                # print date
                # print full_story
                # print category
                # print "\n\n"

                if not db.session.query(Article).filter(Article.title == title).count():
                    article_a = Article(title=title, full_story=full_story, image=image, category=category,
                                        description=description, pubdate=date)
                    db.session.add(article_a)
                    db.session.commit()
                    print article_a.id
                    entity_extract(article_a.id, full_story, 1)

        except psycopg2.IntegrityError:  # as ie:
            # print ie
            print "Caught"
            db.session.rollback()
            # break
            # continue

        except Exception as e:
            print e
            pass
except Exception as e:
    print e
    pass
Example #6
def upload():
    toi_rss={'http://timesofindia.indiatimes.com/rssfeedstopstories.cms': 'Top stories',
             'http://timesofindia.indiatimes.com/rssfeeds/1221656.cms': 'Most Recent',
             'http://timesofindia.feedsportal.com/c/33039/f/533916/index.rss': 'India',
             'http://timesofindia.feedsportal.com/c/33039/f/533917/index.rss': 'World',
             'http://timesofindia.feedsportal.com/c/33039/f/533919/index.rss':'Business',
             'http://timesofindia.feedsportal.com/c/33039/f/533920/index.rss':'Cricket',
             'http://timesofindia.feedsportal.com/c/33039/f/533921/index.rss':'Sports',
             'http://dynamic.feedsportal.com/c/33039/f/533968/index.rss':'Health',
             'http://timesofindia.feedsportal.com/c/33039/f/533922/index.rss':'Science',
             'http://timesofindia.feedsportal.com/c/33039/f/533925/index.rss':'Environment',
             'http://timesofindia.feedsportal.com/c/33039/f/533923/index.rss':'Technology',
             'http://timesofindia.feedsportal.com/c/33039/f/533924/index.rss':'Education',
             'http://timesofindia.feedsportal.com/c/33039/f/533928/index.rss':'Entertainment',
             'http://timesofindia.indiatimes.com/rssfeeds/2886704.cms':'Lifestyle'
            }


    for key, value in toi_rss.iteritems():
        # print key
        d = feedparser.parse(key)

        category = value
        for post in d.entries:
            try:
                title = post.title

                dated = post.published

                if "photo" in post.link:
                    continue
                if "live" in post.link:
                    continue
                if "videos" in post.link:
                    continue
                if "listshow" in post.link:
                    continue

                html = urlopen(post.link)
                bsObj = BeautifulSoup(html, "html.parser")

                images = bsObj.find("link", attrs={"rel":"image_src"})
                if images is not None:
                    images=images['href']
                story_list=bsObj.find("div", attrs={"class":"content"})
                if story_list is None:
                    story_list=bsObj.find("div", attrs={"class":"Normal"})
                    #print("story was none")
                description=bsObj.find("meta", {'name':'description'})['content']

                #print('title :'+title+"\n")
                # print(post.link)
                # print('category :'+category+"\n")
                # print('description :'+description+"\n")
                # print('full story :'+story_list.get_text()+"\n")
                #
                # print (""+images)
                # print ('pubdate:'+dated)
                # save below variables in db
                save_title=title
                #save_link=post.link
                save_category=category
                save_description=description
                save_full_story=story_list.get_text()
                save_image=images
                save_date=dated
                try:
                    if not db.session.query(Article).filter(Article.title == save_title).count():
                        article_a = Article(title=save_title, full_story=save_full_story, image=save_image, category=save_category,
                        description=save_description, pubdate=save_date)
                        db.session.add(article_a)
                        db.session.commit()
                        print article_a.id
                        entity_extract(article_a.id, save_full_story, 1)

                except Exception as e:  # psycopg2.IntegrityError:
                    print "Caught"
                    db.session.rollback()

            except Exception as e:
                print e
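The duplicate-check, insert, commit, and entity_extract sequence above repeats across most of these examples; a hedged refactoring sketch follows (the save_and_extract helper is hypothetical, not part of the original code) that keeps the same behaviour:

from entity_api import entity_extract
from untitled1 import Article, db

def save_and_extract(title, full_story, image, category, description, pubdate):
    # Skip articles whose title is already stored, mirroring the check above.
    if db.session.query(Article).filter(Article.title == title).count():
        return None
    article_a = Article(title=title, full_story=full_story, image=image,
                        category=category, description=description,
                        pubdate=pubdate)
    try:
        db.session.add(article_a)
        db.session.commit()
    except Exception:
        # Roll back on any insert failure, as the original except blocks do.
        db.session.rollback()
        return None
    entity_extract(article_a.id, full_story, 1)
    return article_a.id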
Example #7
            # print image
            # print description
            # print pubdate
            # print full_story

            category = "Firstpost"

            if not db.session.query(Article).filter(
                    Article.title == title).count():
                article_a = Article(title=title,
                                    full_story=full_story,
                                    image=image,
                                    category=category,
                                    description=description,
                                    pubdate=pubdate)
                db.session.add(article_a)
                db.session.commit()
                print article_a.id
                entity_extract(article_a.id, full_story, 1)

        except psycopg2.IntegrityError:  # as ie:
            # print ie
            print "Caught"
            db.session.rollback()
            # break
            # continue

        # print "\n\n"
        except Exception as e:
            print e
            pass
Example #8
        # category
        category = "TechCrunch"

        # print title
        # print image
        # print date
        # print description
        # print html
        # print cleantext
        # print category

        if not db.session.query(Article).filter(Article.title == title).count():
            article_a = Article(title=title, full_story=cleantext, image=image, category=category,
                                description=description, pubdate=date, html=html)
            db.session.add(article_a)
            db.session.commit()
            print article_a.id
            entity_extract(article_a.id, cleantext, 1)
    except psycopg2.IntegrityError:  # as ie:
        # print ie
        print "Caught"
        db.session.rollback()
        # break
        # continue


    except Exception as e:
        print e
        pass
Example #9
            # print title
            # print image
            # print date
            # print description
            # print html
            # print cleantext
            # print category

            if not db.session.query(Article).filter(
                    Article.title == title).count():
                article_a = Article(title=title,
                                    full_story=cleantext,
                                    image=image,
                                    category=category,
                                    description=description,
                                    pubdate=date,
                                    html=html)
                db.session.add(article_a)
                db.session.commit()
                print article_a.id
                entity_extract(article_a.id, cleantext, 1)
        except psycopg2.IntegrityError:  # as ie:
            # print ie
            print "Caught"
            db.session.rollback()
            # break
            # continue

        except Exception as e:
            print e
            pass
Example #10
from entity_api import entity_extract
from untitled1 import Article, db, Keyword

# article = Article.query.filter_by(id=92).first()
# print article.category
# article.category = 'Musics'
# db.session.commit()

#data = Article.query.all()
#for article in data:
#    print article.category

#entity_extract(1, "The East India Company in the 1700s conjures pictures of British colonisation. What originally started as trade and business eventually led to 200 years of British Raj for our country.\nAt its peak, the company accounted for about half the global trade specialising in commodities like cotton, indigo, tea and opium, and offering employment to a third of the British workforce.\nAfter the 1857 mutiny in India, all its powers were transferred to the British Crown, and eventually, by 1874, the company was dissolved.\nForwarding into over a century later, in 2005, Mumbai-born entrepreneur, Sanjiv Mehta, with a \u201csense of redemption\u201d, bought the company from its 30 odd owners, and turned it into a luxury food brand.\nThe company now specialises in selling gourmet coffees, chocolates, rare teas, and other luxury-food items through its e-commerce website.\nThe first store was launched in the Mayfair neighborhood in London. Today, the company has escalated commercially and now runs stores across the UK, the Middle East, Europe and Asia, in addition to a successful e-commerce website.\nOn inaugurating the company, Mehta received congratulatory e-mails from thousands of Indians.\n\u201cIt is a dream come true to build a business like this and to acquire a brand like this to own the company,\u201d he said.\u201d\n\nParticipate in this discussion")
# key = Keyword.query.filter_by(id=2).first()
# print key.key_name
# Re-run entity extraction for articles already stored with ids 560-1289.
for i in range(560, 1290):
    article = Article.query.filter_by(id=i).first()
    if article is None:
        continue
    entity_extract(article.id, article.full_story, 1)
    print "done"
print "all done"
Example #11
def upload():
    toi_rss = {
        'http://timesofindia.indiatimes.com/rssfeedstopstories.cms':
        'Top stories',
        'http://timesofindia.indiatimes.com/rssfeeds/1221656.cms':
        'Most Recent',
        'http://timesofindia.feedsportal.com/c/33039/f/533916/index.rss':
        'India',
        'http://timesofindia.feedsportal.com/c/33039/f/533917/index.rss':
        'World',
        'http://timesofindia.feedsportal.com/c/33039/f/533919/index.rss':
        'Business',
        'http://timesofindia.feedsportal.com/c/33039/f/533920/index.rss':
        'Cricket',
        'http://timesofindia.feedsportal.com/c/33039/f/533921/index.rss':
        'Sports',
        'http://dynamic.feedsportal.com/c/33039/f/533968/index.rss': 'Health',
        'http://timesofindia.feedsportal.com/c/33039/f/533922/index.rss':
        'Science',
        'http://timesofindia.feedsportal.com/c/33039/f/533925/index.rss':
        'Environment',
        'http://timesofindia.feedsportal.com/c/33039/f/533923/index.rss':
        'Technology',
        'http://timesofindia.feedsportal.com/c/33039/f/533924/index.rss':
        'Education',
        'http://timesofindia.feedsportal.com/c/33039/f/533928/index.rss':
        'Entertainment',
        'http://timesofindia.indiatimes.com/rssfeeds/2886704.cms': 'Lifestyle'
    }

    for key, value in toi_rss.iteritems():
        # print key
        d = feedparser.parse(key)

        category = value
        for post in d.entries:
            try:
                title = post.title

                dated = post.published

                if "photo" in post.link:
                    continue
                if "live" in post.link:
                    continue
                if "videos" in post.link:
                    continue
                if "listshow" in post.link:
                    continue

                html = urlopen(post.link)
                bsObj = BeautifulSoup(html, "html.parser")

                images = bsObj.find("link", attrs={"rel": "image_src"})
                if images is not None:
                    images = images['href']
                story_list = bsObj.find("div", attrs={"class": "content"})
                if story_list is None:
                    story_list = bsObj.find("div", attrs={"class": "Normal"})
                    #print("story was none")
                description = bsObj.find("meta",
                                         {'name': 'description'})['content']

                #print('title :'+title+"\n")
                # print(post.link)
                # print('category :'+category+"\n")
                # print('description :'+description+"\n")
                # print('full story :'+story_list.get_text()+"\n")
                #
                # print (""+images)
                # print ('pubdate:'+dated)
                # save below variables in db
                save_title = title
                #save_link=post.link
                save_category = category
                save_description = description
                save_full_story = story_list.get_text()
                save_image = images
                save_date = dated
                try:
                    if not db.session.query(Article).filter(
                            Article.title == save_title).count():
                        article_a = Article(title=save_title,
                                            full_story=save_full_story,
                                            image=save_image,
                                            category=save_category,
                                            description=save_description,
                                            pubdate=save_date)
                        db.session.add(article_a)
                        db.session.commit()
                        print article_a.id
                        entity_extract(article_a.id, save_full_story, 1)

                except Exception as e:  # psycopg2.IntegrityError:
                    print "Caught"
                    db.session.rollback()

            except Exception as e:
                print e