コード例 #1
0
 def get_medium(self, text):
     """Return ``Medium.get_or_create`` for *text* after alias resolution.

     A falsy *text* (``None``, empty string) is replaced by ``"unknown"``.
     The name is then looked up in ``MEDIUM_ALIASES``; when it is not a
     known alias the name itself is used.
     """
     name = text or "unknown"
     return Medium.get_or_create(MEDIUM_ALIASES.get(name, name))
コード例 #2
0
ファイル: bzk_pdf.py プロジェクト: pombredanne/amcat
 def create_medium(self, medium):
     """Return ``Medium.get_or_create`` for *medium* after alias resolution.

     A falsy or zero-length *medium* is replaced by ``"unknown"``. Names
     found in ``MEDIUM_ALIASES`` are mapped to their aliased value first.
     """
     if not (medium and len(medium) >= 1):
         medium = "unknown"
     canonical = MEDIUM_ALIASES.get(medium, medium)
     return Medium.get_or_create(canonical)
コード例 #3
0
ファイル: bzk_html.py プロジェクト: pombredanne/amcat
 def get_medium(self, text):
     """Resolve *text* through ``MEDIUM_ALIASES`` and fetch the Medium.

     Falsy input is treated as the name ``"unknown"``. The resolved name
     (the alias target if there is one, otherwise the name unchanged) is
     handed to ``Medium.get_or_create`` and that result is returned.
     """
     medium_name = text if text else "unknown"
     resolved = MEDIUM_ALIASES.get(medium_name, medium_name)
     return Medium.get_or_create(resolved)
コード例 #4
0
 def create_medium(self, medium):
     """Resolve *medium* through ``MEDIUM_ALIASES`` and fetch the Medium.

     Falsy or zero-length input is treated as the name ``"unknown"``.
     Returns whatever ``Medium.get_or_create`` returns for the resolved
     name.
     """
     has_name = medium and len(medium) >= 1
     name = medium if has_name else "unknown"
     return Medium.get_or_create(MEDIUM_ALIASES.get(name, name))
コード例 #5
0
ファイル: bzk_html.py プロジェクト: IanHongruZhang/amcat
 def get_medium(text):
     """Return the medium name for *text*, mapped through ``MEDIUM_ALIASES``.

     Falsy *text* becomes ``"unknown"``. If the name is a key of
     ``MEDIUM_ALIASES`` its mapped value is returned, otherwise the name
     itself.
     """
     name = text or "unknown"
     return MEDIUM_ALIASES.get(name, name)
コード例 #6
0
ファイル: bzk_html.py プロジェクト: kasperwelbers/amcat
 def create_medium(self, html):
     """Return ``Medium.get_or_create`` for the name held in ``html.text``.

     When ``html.text`` is falsy the name ``"unknown"`` is used. Names
     that are keys of ``MEDIUM_ALIASES`` are replaced by their mapped
     value before the Medium is fetched.
     """
     medium = html.text if html.text else "unknown"
     return Medium.get_or_create(MEDIUM_ALIASES.get(medium, medium))
コード例 #7
0
ファイル: bzk_html.py プロジェクト: kasperwelbers/amcat
 def create_medium(self, html):
     """Fetch the Medium named by ``html.text``, resolving aliases.

     A falsy ``html.text`` falls back to ``"unknown"``. The name is looked
     up in ``MEDIUM_ALIASES`` (unknown names are kept as they are) and the
     result of ``Medium.get_or_create`` for it is returned.
     """
     name = html.text if html.text else "unknown"
     resolved = MEDIUM_ALIASES.get(name, name)
     return Medium.get_or_create(resolved)
コード例 #8
0
    def _scrape_unit(self, _file):
        """Parse one BZK e-mail file and yield the Article it contains.

        The whole file is read. Every line up to and including the first one
        that starts with ``"1red"`` is stored as the mail header; the lines
        after it are expected to hold a metadata line
        (``date, section, medium`` separated by ``", "``), an upper-case
        headline and then the body paragraphs separated by empty lines.

        :param _file: object with a ``readlines()`` method returning the
            lines of the mail.
        :raises IndexError: when there is no ``Date:`` line, when the content
            runs out before a headline (or after it, before any body line),
            or when the metadata line has fewer than three fields.
        """
        readlines = _file.readlines()
        file_date_line = [l for l in readlines if l.startswith("Date:")][0]
        file_date = read_date(file_date_line.split("Date:")[1])

        lines = []
        mail_header = []
        for line in readlines:
            if lines:
                lines.append(line.rstrip("\r\n"))
            else:
                mail_header.append(line)
            if line.startswith("1red"):  # actual content starts
                lines.append("")

        article = Article(metastring={'mail_header': "".join(mail_header)})

        while True:  # loop through lines up to and including headline
            line = lines.pop(0)
            if line.isupper():
                article.title = line
                break
            elif line:  # non-empty line before the headline: metadata
                data = line.split(", ")
                datestr = data[0]
                if "'" in datestr:
                    # Two-digit year written as '14 -> expand to 2014.
                    split = datestr.split("'")
                    datestr = split[0] + "20" + split[1]
                if "=" in datestr:  # if this is true, the year is not parsable
                    # We take the year the mail was sent; that is wrong when
                    # the mail was sent in the year after publication.
                    datestr = datestr.split("=")[0] + str(file_date.year)
                    article.date = read_date(datestr)
                    if (article.date - file_date).days > 200:
                        # Likely a misparse, with the mail being sent the
                        # next year. timedelta has no ``years`` argument, so
                        # step the year back explicitly.
                        # (assumes read_date returns a date/datetime)
                        previous_year = article.date.year - 1
                        try:
                            article.date = article.date.replace(
                                year=previous_year)
                        except ValueError:
                            # 29 February does not exist in the previous year.
                            article.date = article.date.replace(
                                year=previous_year, day=28)
                else:
                    article.date = read_date(datestr)
                # Look up the same field that is used as the fallback; the
                # medium is the third field, the section the second.
                medium_str = BZK_ALIASES.get(data[2], data[2])
                article.set_property("medium", medium_str)
                article.set_property("section", data[1])

        paragraphs = []
        paragraph = ""
        while True:
            # Trailing "=" is dropped so wrapped lines join seamlessly
            # (presumably quoted-printable soft line breaks).
            line = lines.pop(0).rstrip("=")
            if not line:
                # NOTE: a paragraph is only stored when an empty line follows
                # it; text after the last empty line is not kept.
                paragraphs.append(paragraph)
                paragraph = ""
            elif line.isupper():  # subheader
                paragraph += line + "\n"
            else:
                paragraph += line
            if not lines:
                break
        paragraphs = [p for p in paragraphs if p]

        text_parts = []
        for p in paragraphs:
            text_parts.append(p + "\n\n")
            if p.startswith("(") and "," in p:
                # last line of the normal content
                break

        # Add non-ascii characters
        # Takes the '=AB' occurrences and turns them into latin-1 characters.
        def character(match):
            code = match.group()[1:]
            if code == "92":
                return "'"
            if code == "85":
                return "..."
            try:
                value = int(code, 16)
            except ValueError:
                # Matched by the pattern but not a hex escape (e.g. "=ZZ"):
                # leave the text untouched instead of failing.
                return match.group()
            # Works on Python 2 and 3, unlike str.decode('string-escape').
            return bytearray([value]).decode('latin-1')

        article.text = re.sub("=[A-Z0-9]{2}", character, "".join(text_parts))

        yield article
コード例 #9
0
 def get_medium(self, medium):
     """Return the medium name for *medium*, mapped through ``MEDIUM_ALIASES``.

     Falsy or zero-length input is treated as ``"unknown"``. Names that are
     not keys of ``MEDIUM_ALIASES`` are returned unchanged.
     """
     if medium and len(medium) >= 1:
         name = medium
     else:
         name = "unknown"
     return MEDIUM_ALIASES[name] if name in MEDIUM_ALIASES else name
コード例 #10
0
from amcat.models.medium import Medium
from amcat.models.article import Article
from amcat.scripts.article_upload.bzk_aliases import BZK_ALIASES as aliases

# For every alias that differs from its medium: repoint the project-29
# articles from the alias medium to the real medium, then delete the alias
# medium if no article refers to it any more.
for alias, medium in aliases.items():
    if alias == medium:
        print('alias is no alias')
        continue

    print(alias, " > ", medium)
    # change all articles in project 29
    alias_medium = Medium.get_or_create(alias)
    aliased_articles = Article.objects.filter(
        medium=alias_medium.id, project_id=29)
    print("{} articles".format(aliased_articles.count()))
    target_medium = Medium.get_or_create(medium)
    aliased_articles.update(medium=target_medium.id)
    # if the alias medium is now unused (in any project), delete it
    remaining = Article.objects.filter(medium=alias_medium.id).count()
    if remaining == 0:
        print('deleting...')
        alias_medium.delete()
コード例 #11
0
ファイル: tmp.py プロジェクト: pombredanne/amcat
#inverting bzk aliases dict
from amcat.scripts.article_upload.bzk_aliases import BZK_ALIASES

if __name__ == '__main__':

    # Invert the mapping: each alias listed under a key of BZK_ALIASES
    # becomes a key pointing back at that original key. When an alias occurs
    # under several keys, the one iterated last wins.
    new_dict = {
        alias: medium
        for medium, alias_list in BZK_ALIASES.items()
        for alias in alias_list
    }
    print(new_dict)

    # WVA: Why is this here? Isn't this the same as the script in
    # maintenance/tmp? If so, please `hg rm` it!
コード例 #12
0
ファイル: bzk_eml.py プロジェクト: CJStuart/amcat
    def _scrape_unit(self, _file):
        """Parse one BZK e-mail file and yield the Article it contains.

        The whole file is read. Every line up to and including the first one
        that starts with ``"1red"`` is stored as the mail header; the lines
        after it are expected to hold a metadata line
        (``date, section, medium`` separated by ``", "``), an upper-case
        headline and then the body paragraphs separated by empty lines.

        :param _file: object with a ``readlines()`` method returning the
            lines of the mail.
        :raises IndexError: when there is no ``Date:`` line, when the content
            runs out before a headline (or after it, before any body line),
            or when the metadata line has fewer than three fields.
        """
        readlines = _file.readlines()
        file_date_line = [l for l in readlines if l.startswith("Date:")][0]
        file_date = readDate(file_date_line.split("Date:")[1])

        lines = []
        mail_header = []
        for line in readlines:
            if lines:
                lines.append(line.rstrip("\r\n"))
            else:
                mail_header.append(line)
            if line.startswith("1red"):  # actual content starts
                lines.append("")

        article = Article(metastring={'mail_header': "".join(mail_header)})

        while True:  # loop through lines up to and including headline
            line = lines.pop(0)
            if line.isupper():  # headline
                article.headline = line
                break
            elif line:  # non-empty line before the headline: metadata
                data = line.split(", ")
                datestr = data[0]
                if "'" in datestr:
                    # Two-digit year written as '14 -> expand to 2014.
                    split = datestr.split("'")
                    datestr = split[0] + "20" + split[1]
                if "=" in datestr:  # if this is true, the year is not parsable
                    # We take the year the mail was sent; that is wrong when
                    # the mail was sent in the year after publication.
                    datestr = datestr.split("=")[0] + str(file_date.year)
                    article.date = readDate(datestr)
                    if (article.date - file_date).days > 200:
                        # Likely a misparse, with the mail being sent the
                        # next year. timedelta has no ``years`` argument, so
                        # step the year back explicitly.
                        # (assumes readDate returns a date/datetime)
                        previous_year = article.date.year - 1
                        try:
                            article.date = article.date.replace(
                                year=previous_year)
                        except ValueError:
                            # 29 February does not exist in the previous year.
                            article.date = article.date.replace(
                                year=previous_year, day=28)
                else:
                    article.date = readDate(datestr)
                # Look up the same field that is used as the fallback; the
                # medium is the third field, the section the second.
                medium_str = BZK_ALIASES.get(data[2], data[2])
                article.medium = Medium.get_or_create(medium_str)
                article.section = data[1]

        paragraphs = []
        paragraph = ""
        while True:
            # Trailing "=" is dropped so wrapped lines join seamlessly
            # (presumably quoted-printable soft line breaks).
            line = lines.pop(0).rstrip("=")
            if not line:
                # NOTE: a paragraph is only stored when an empty line follows
                # it; text after the last empty line is not kept.
                paragraphs.append(paragraph)
                paragraph = ""
            elif line.isupper():  # subheader
                paragraph += line + "\n"
            else:
                paragraph += line
            if not lines:
                break
        paragraphs = [p for p in paragraphs if p]

        text_parts = []
        for p in paragraphs:
            text_parts.append(p + "\n\n")
            if p.startswith("(") and "," in p:
                # last line of the normal content
                break

        # Add non-ascii characters
        # Takes the '=AB' occurrences and turns them into latin-1 characters.
        def character(match):
            code = match.group()[1:]
            if code == "92":
                return "'"
            if code == "85":
                return "..."
            try:
                value = int(code, 16)
            except ValueError:
                # Matched by the pattern but not a hex escape (e.g. "=ZZ"):
                # leave the text untouched instead of failing.
                return match.group()
            # Works on Python 2 and 3, unlike str.decode('string-escape').
            return bytearray([value]).decode('latin-1')

        article.text = re.sub(
            "=[A-Z0-9]{2}",
            character,
            "".join(text_parts))

        yield article
コード例 #13
0
from amcat.models.medium import Medium
from amcat.models.article import Article
from amcat.scripts.article_upload.bzk_aliases import BZK_ALIASES as aliases

# Walk over the alias table; for each real alias, move the project-29
# articles over to the proper medium and remove the alias medium once no
# article uses it.
for alias, medium in aliases.items():
    if alias == medium:
        print('alias is no alias')
        continue

    print(alias, " > ", medium)
    # change all articles in project 29
    old_medium = Medium.get_or_create(alias)
    to_move = Article.objects.filter(medium=old_medium.id, project_id=29)
    print("{} articles".format(to_move.count()))
    new_medium = Medium.get_or_create(medium)
    to_move.update(medium=new_medium.id)
    # if the alias medium is now unused (in any project), delete it
    still_used = Article.objects.filter(medium=old_medium.id).count()
    if still_used == 0:
        print('deleting...')
        old_medium.delete()