예제 #1
0
def splitFiles(type):
    """Extract the head or tail of the selected papers and flag the successes.

    Reads the candidate rows from ``resolved_papers``, calls ``_getHead`` or
    ``_getTail`` with each paper's id and page count, and sets the ``type``
    column to 1 for every paper whose extraction returned a truthy result.

    Uses the ``db`` connection and ``cur`` cursor defined outside this
    function, and closes ``cur`` before returning.

    Args:
        type: ``'head'`` or ``'tail'``. Selects the extraction helper and is
            also used as a column name in the SQL statements.

    Raises:
        ValueError: If ``type`` is neither ``'head'`` nor ``'tail'``.
    """
    # `type` ends up in the SQL as a column name and picks the helper to call,
    # so reject anything else up front instead of failing later with an
    # unbound `res`.
    if type not in ('head', 'tail'):
        raise ValueError("type must be 'head' or 'tail', got %r" % (type,))

    # NOTE(review): both queries are restricted to `id in (1702)`, which looks
    # like a debugging filter - confirm before running on the full table.
    if type == 'head':
        sql = 'select id, npages from resolved_papers where downloaded = 1 and npages >= 5 and pdf2text = 1 and english = 1 and %s = 1 and id in (1702);' % (
            type)
        extract = _getHead
    else:
        sql = 'select id, npages from resolved_papers where downloaded = 1 and npages >= 5 and pdf2text = 1 and english = 1 and head = 1 and %s = 1 and id in (1702);' % (
            type)
        extract = _getTail
    print(sql)
    papers = pd.read_sql(sql, con=db)

    try:
        for _, row in papers.iterrows():
            # Access by label: integer keys on a label-indexed Series are a
            # deprecated positional fallback in pandas.
            paper_id = row['id']
            res = extract(paper_id, row['npages'])
            if not res:
                continue

            sql = "update resolved_papers set %s = 1 where id = %s" % (
                type, paper_id)
            try:
                cur.execute(sql)
                db.commit()
            except Exception as exc:
                # Keep processing the remaining papers, but report the failure
                # instead of discarding it silently.
                db.rollback()
                print("Id: %s. Update failed: %s" % (paper_id, exc))
            print("Id: %s. %s: %s" % (paper_id, type.title(), res))
    finally:
        cur.close()
예제 #2
0
def prepareTail():
    """Run ``_prepareTail`` on every selected paper and print each result.

    Reads the ids from ``resolved_papers`` through the ``db`` connection
    defined outside this function, and closes the outer ``cur`` cursor before
    returning.
    """
    # NOTE(review): the query is restricted to `id in (1702)`, which looks
    # like a debugging filter - confirm before running on the full table.
    sql = 'select id from resolved_papers where downloaded = 1 and npages >= 5 and pdf2text = 1 and english = 1 and head = 1 and tail = 1 and id in (1702);'
    papers = pd.read_sql(sql, con=db)

    try:
        for _, row in papers.iterrows():
            # Access by label: integer keys on a label-indexed Series are a
            # deprecated positional fallback in pandas.
            paper_id = row['id']
            res = _prepareTail(paper_id)

            print("Id: %s. Tail prepared: %s" % (paper_id, res))
    finally:
        cur.close()
예제 #3
0
def _countOccurencies(papers):
    """Store how often each tracked keyword occurs in one paper title.

    The title is passed through ``_processText`` and ``_processNL``, the
    resulting words are counted with ``nltk.FreqDist``, and one row is inserted
    into ``resolved_papers_title_occurrencies`` for every keyword that occurs
    at least once.

    Args:
        papers: An ``(id, title)`` pair.
    """
    # keep this connection in order to use multiprocessing
    # NOTE(review): credentials are hard-coded here; consider loading them
    # from configuration or the environment.
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
                         user="******",  # your username
                         passwd="iwJx0EAM",  # your password
                         db="clpd")  # name of the data base

    cur = db.cursor()

    # Stored lower-case already, so they can be looked up in the frequency
    # distribution directly.
    keywords = (
        "cross-language",
        "crosslanguage",
        "cross-lingual",
        "crosslingual",
        "cross-linguistic",
        "crosslinguistic",
        "multi-language",
        "multilanguage",
        "multi-lingual",
        "multilingual",
        "multi-linguistic",
        "multilinguistic",
        "machine-translation",
        "copy",
        "duplicate",
        "plagiarism",
        "detection",
        "discovery",
    )

    try:
        paper_id, title = papers

        text = _processText(title)
        words = _processNL(text)
        fdist = nltk.FreqDist(words)

        for keyword in keywords:
            occurrences = fdist[keyword]
            if occurrences > 0:
                sql = "insert into resolved_papers_title_occurrencies values (%s, '%s', %s);" % (
                    paper_id, keyword, occurrences)
                try:
                    cur.execute(sql)
                    db.commit()
                    print('saved')
                except Exception as exc:
                    # One failed insert must not stop the remaining keywords,
                    # but it should be visible.
                    db.rollback()
                    print("Id: %s. Keyword '%s' not saved: %s" % (
                        paper_id, keyword, exc))
    except Exception:
        db.rollback()
        print('no saved')
    finally:
        # The connection is opened per call, so it has to be released per call.
        cur.close()
        db.close()
예제 #4
0
def classifyPub():
    """Classify every unclassified paper and store the resulting type.

    Selects the rows of ``resolved_papers`` whose ``type`` is NULL (plus the
    other filters in the query), calls ``_classifyPub`` with each paper's id
    and page count, and writes a truthy result into the ``type`` column.

    Uses the ``db`` connection and ``cur`` cursor defined outside this
    function, and closes ``cur`` before returning.
    """
    sql = 'select id, npages from resolved_papers where downloaded = 1 and npages >= 5 and pdf2text = 1 and english = 1 and head = 1 and tail = 1 and type is NULL;'
    papers = pd.read_sql(sql, con=db)

    try:
        for _, row in papers.iterrows():
            # Access by label: integer keys on a label-indexed Series are a
            # deprecated positional fallback in pandas.
            paper_id = row['id']
            res = _classifyPub(paper_id, row['npages'])
            if not res:
                continue

            sql = "update resolved_papers set type = '%s' where id = %s" % (
                res, paper_id)
            print(sql)
            try:
                cur.execute(sql)
                db.commit()
            except Exception as exc:
                # Keep processing the remaining papers, but report the failure
                # instead of discarding it silently.
                db.rollback()
                print("Id: %s. Update failed: %s" % (paper_id, exc))
            print("Id: %s. type: %s" % (paper_id, res))
    finally:
        cur.close()
예제 #5
0
def updateNumPages():
    """Fill in the page count of downloaded papers that do not have one yet.

    Selects the downloaded rows of ``resolved_papers`` whose ``npages`` is
    NULL, calls ``_getNPages`` for each id and stores every truthy result in
    the ``npages`` column.

    Uses the ``db`` connection and ``cur`` cursor defined outside this
    function, and closes ``cur`` before returning.
    """
    sql = 'select id from resolved_papers where downloaded = 1 and npages is NULL;'
    papers = pd.read_sql(sql, con=db)

    try:
        for _, row in papers.iterrows():
            # Access by label: integer keys on a label-indexed Series are a
            # deprecated positional fallback in pandas.
            paper_id = row['id']
            pages = _getNPages(paper_id)
            if not pages:
                continue

            sql = "update resolved_papers set npages = %s where id = %s" % (
                pages, paper_id)
            try:
                cur.execute(sql)
                db.commit()
            except Exception as exc:
                # Keep processing the remaining papers, but report the failure
                # instead of discarding it silently.
                db.rollback()
                print("Id: %s. Update failed: %s" % (paper_id, exc))
            print("Id: %s. Num Pages: %s" % (paper_id, pages))
    finally:
        cur.close()
예제 #6
0
def _titlesLang(ids):
    """Store the result of ``_checkTitle`` for one paper title.

    Inserts ``(id, _checkTitle(title))`` into ``resolved_papers_title`` and
    prints ``saved`` on success or ``no saved`` on any failure.

    Args:
        ids: An ``(id, title)`` pair.
    """
    # keep this connection in order to use multiprocessing
    # NOTE(review): credentials are hard-coded here; consider loading them
    # from configuration or the environment.
    db = pymysql.connect(
        host=
        "dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
        user="******",  # your username
        passwd="iwJx0EAM",  # your password
        db="clpd")  # name of the data base

    cur = db.cursor()

    try:
        paper_id, title = ids
        lang = _checkTitle(title)
        sql = "insert into resolved_papers_title values (%s, '%s');" % (
            paper_id, lang)
        print(sql)
        cur.execute(sql)
        db.commit()
        print("saved")
    except Exception:
        db.rollback()
        print('no saved')
    finally:
        # The connection is opened per call, so it has to be released per call.
        cur.close()
        db.close()
예제 #7
0
def downloadPDFIEEE(ids):
    """Download the PDF of one IEEE paper and flag it as downloaded.

    Nothing is downloaded unless the title language is ``"en"``. The direct
    link is tried first through the project's ``DownloadPDF`` helper. If that
    does not succeed, the first run of digits in ``main_link`` is used as the
    article number of an IEEE Xplore stamp URL, which is fetched with
    ``requests``. On success ``resolved_papers.downloaded`` is set to 1.

    Args:
        ids: An ``(id, title_language, main_link, direct_link)`` tuple.
    """
    # keep this connection in order to use multiprocessing
    # NOTE(review): credentials are hard-coded here; consider loading them
    # from configuration or the environment.
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
                         user="******",  # your username
                         passwd="iwJx0EAM",  # your password
                         db="clpd")  # name of the data base

    cur = db.cursor()

    destination = '/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/'

    try:
        i, res_title, main_link, direct_link = ids

        if res_title != "en":
            return

        print(i)
        print(main_link)
        print(direct_link)

        downloaded = False

        # First attempt: the project's downloader on the direct link.
        if direct_link:
            p = DownloadPDF().download(direct_link,
                                       destination=destination,
                                       path=str(i) + '.pdf')
            # The helper's return type is not visible here, so the original
            # `== True` comparison is kept as is.
            downloaded = (p == True)

        # Second attempt: a plain GET on the stamp page for the article number.
        if not downloaded:
            number = re.search(r'\d+', main_link or '')
            if number:
                url = 'https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=%s' % (
                    number.group(0))
                response = requests.get(url)
                # Do not flag the paper as downloaded on an HTTP error.
                if response.ok:
                    # The id has to be formatted into the file name itself,
                    # not applied to the value returned by write().
                    with open(destination + '%s.pdf' % (i), 'wb') as pdf_file:
                        pdf_file.write(response.content)
                    downloaded = True

        if downloaded:
            sql = "update resolved_papers set downloaded = 1 where id = %s" % (i)

            try:
                cur.execute(sql)
                db.commit()
                print("Id: %s. Downloaded: %s. Saved!" % (i, downloaded))
            except Exception as exc:
                db.rollback()
                print("Id: %s. Update failed: %s" % (i, exc))
        else:
            print("Id: %s. Downloaded: %s." % (i, downloaded))

    except UnicodeDecodeError as exc:
        # Still best-effort for undecodable input, but no longer silent.
        print("Skipped because of a decoding error: %s" % (exc,))
    finally:
        # The connection is opened per call, so it has to be released per call.
        cur.close()
        db.close()
예제 #8
0
def downloadPDF(ids):
    """Download the PDF of one paper and flag it as downloaded.

    Nothing is downloaded unless the title language is ``"en"``. The direct
    link is tried first and the main link second, both through the project's
    ``DownloadPDF`` helper. On success ``resolved_papers.downloaded`` is set
    to 1.

    Args:
        ids: An ``(id, title_language, main_link, direct_link)`` tuple.
    """
    # keep this connection in order to use multiprocessing
    # NOTE(review): credentials are hard-coded here; consider loading them
    # from configuration or the environment.
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
                         user="******",  # your username
                         passwd="iwJx0EAM",  # your password
                         db="clpd")  # name of the data base

    cur = db.cursor()

    destination = '/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/'

    try:
        i, res_title, main_link, direct_link = ids

        if res_title != "en":
            return

        print(i)
        print(main_link)
        print(direct_link)

        downloaded = False

        # One attempt per link, direct link first; a missing link is skipped.
        for url in (direct_link, main_link):
            if not url:
                continue
            p = DownloadPDF().download(url,
                                       destination=destination,
                                       path=str(i) + '.pdf')
            # The helper's return type is not visible here, so the original
            # `== True` comparison is kept as is.
            if p == True:
                downloaded = True
                break

        if downloaded:
            sql = "update resolved_papers set downloaded = 1 where id = %s" % (i)

            try:
                cur.execute(sql)
                db.commit()
                print("Id: %s. Downloaded: %s. Saved!" % (i, downloaded))
            except Exception as exc:
                db.rollback()
                print("Id: %s. Update failed: %s" % (i, exc))
        else:
            print("Id: %s. Downloaded: %s." % (i, downloaded))

    except UnicodeDecodeError as exc:
        # Still best-effort for undecodable input, but no longer silent.
        print("Skipped because of a decoding error: %s" % (exc,))
    finally:
        # The connection is opened per call, so it has to be released per call.
        cur.close()
        db.close()
예제 #9
0
def _filterTitle(papers):
    """Select a paper when its title mentions any tracked keyword group.

    Counts, by case-insensitive substring match, how many terms of each group
    (cross-/multi-language, copy, detection) occur in the title. If any group
    reaches the threshold, the paper id is inserted into
    ``resolved_papers_selected_title``.

    Args:
        papers: An ``(id, title)`` pair.

    Returns:
        True when the paper was selected and stored, False when no group
        reached the threshold, and None when an error occurred (``no saved``
        is printed in that case).
    """
    # keep this connection in order to use multiprocessing
    # NOTE(review): credentials are hard-coded here; consider loading them
    # from configuration or the environment.
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
                         user="******",  # your username
                         passwd="iwJx0EAM",  # your password
                         db="clpd")  # name of the data base

    cur = db.cursor()

    threshold = 1

    # Stored lower-case already, because they are matched against the
    # lower-cased title.
    diff_language_terms = (
        "cross-language",
        "crosslanguage",
        "cross-lingual",
        "crosslingual",
        "cross-linguistic",
        "crosslinguistic",
        "multi-language",
        "multilanguage",
        "multi-lingual",
        "multilingual",
        "multi-linguistic",
        "multilinguistic",
        "machine-translation",
    )
    copy_terms = ("copy", "duplicate", "plagiarism")
    detection_terms = ("detection", "discovery")

    try:
        paper_id, title = papers
        title = title.lower()

        k_dflanguage = sum(1 for term in diff_language_terms if term in title)
        k_copy = sum(1 for term in copy_terms if term in title)
        k_detection = sum(1 for term in detection_terms if term in title)

        print("diff_language: %s." % (k_dflanguage))
        print("copy: %s." % (k_copy))
        print("detection: %s." % (k_detection))

        # The copy group is part of the test; it used to be counted but never
        # checked because the detection count was compared twice.
        selected = (k_dflanguage >= threshold or k_copy >= threshold
                    or k_detection >= threshold)
        if not selected:
            return False

        sql = "insert into resolved_papers_selected_title values (%s)" % (paper_id)
        print(sql)
        cur.execute(sql)
        db.commit()
        return True
    except Exception:
        db.rollback()
        print('no saved')
        return None
    finally:
        # Runs on every exit path, including the early returns above.
        cur.close()
        db.close()
def downloadPDF(ids):
    """Download the PDF of one paper with ``urlretrieve`` and flag it.

    The direct link is tried first and the main link second. An attempt counts
    as successful when the retrieved resource reports the content type
    ``application/pdf``. On success ``resolved_papers.downloaded`` is set to 1
    for the given id.

    Args:
        ids: An ``(id, main_link, direct_link)`` tuple.
    """
    # keep this connection in order to use multiprocessing
    # NOTE(review): credentials are hard-coded here; consider loading them
    # from configuration or the environment.
    db = pymysql.connect(
        host=
        "dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
        user="******",  # your username
        passwd="iwJx0EAM",  # your password
        db="clpd")  # name of the data base

    cur = db.cursor()

    destination = '/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/'

    try:
        i, main_link, direct_link = ids

        print(i)
        print(main_link)
        print(direct_link)

        downloaded = False
        path = destination + str(i) + '.pdf'

        # The URL and id come from the arguments; the hard-coded test URL and
        # `i = 149` override that used to replace them have been removed.
        for url in (direct_link, main_link):
            if not url:
                continue

            try:
                ua = UserAgent()
                headers = {'User-Agent': str(ua.random)}

                # Diagnostics only: print where the link resolves to.
                # NOTE(review): these two requests do not feed the download
                # below - confirm whether they are still needed.
                r = requests.head(url, allow_redirects=True)
                print(r.url)

                s = requests.session()

                res = s.get(url, headers=headers, allow_redirects=False)
                print(res.url)

                p = urlretrieve(url, path)

                if p[1].get_content_type() == 'application/pdf':
                    downloaded = True
                    break
            except Exception as exc:
                # A failed attempt falls through to the next link, but is
                # reported instead of being discarded silently.
                print("Id: %s. Download attempt failed: %s" % (i, exc))

        if downloaded:
            sql = "update resolved_papers set downloaded = 1 where id = %s" % (
                i)

            try:
                cur.execute(sql)
                db.commit()
                print("Id: %s. Downloaded: %s. Saved!" % (i, downloaded))
            except Exception as exc:
                db.rollback()
                print("Id: %s. Update failed: %s" % (i, exc))
        else:
            print("Id: %s. Downloaded: %s." % (i, downloaded))

    except UnicodeDecodeError as exc:
        # Still best-effort for undecodable input, but no longer silent.
        print("Skipped because of a decoding error: %s" % (exc,))
    finally:
        # The connection is opened per call, so it has to be released per call.
        cur.close()
        db.close()